# Scrape the Inside Airbnb "get the data" page and keep only the
# Bristol download links from the 2021-03-20 snapshot.
url_link <- read_html("http://insideairbnb.com/get-the-data.html")
page_hrefs <- url_link %>%
  html_nodes("a") %>%   # every anchor tag on the page
  html_attr("href")     # its target URL
all_links <- page_hrefs %>%
  str_subset("bristol") %>%
  str_subset("2021-03-20")
# download file
# - seq_along() is safe when all_links is empty (1:length() would
#   iterate over c(1, 0) and call download.file with a bad index)
# - mode = "wb" avoids corrupting the .zip archives on Windows, where
#   the default mode can write binary downloads in text mode
for (i in seq_along(all_links)) {
  print(i)  # progress indicator
  filename <- paste0("file", "_", i, ".zip")
  download.file(all_links[i], filename, mode = "wb")
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
# collect the downloaded archives and read each into a data frame
# (readr::read_csv handles compressed inputs; presumably each zip
# holds one csv -- TODO confirm)
myfiles <- list.files(pattern="file.*zip")
file = lapply(myfiles, read_csv)
# inspect data
file_1 = file[[1]]
View(file_1)
glimpse(file_1)
file_1 %>% select_if(is.Date) %>% arrange(calendar_last_scraped) #inspect date
# duplicate check: zero rows returned means listing ids are unique
file_1 %>% count(id) %>% filter(n > 1)
We can see some listings with no reviews (NA values).
file 2: Listings information (calendar-based)
# inspect data: open each remaining table in the viewer and print its
# column summary
file_2 = file[[2]]
View(file_2)
glimpse(file_2)
file_3 = file[[3]]
View(file_3)
glimpse(file_3)
file_4 = file[[4]]
View(file_4)
glimpse(file_4)
file_5 = file[[5]]
View(file_5)
glimpse(file_5)
file_6 = file[[6]]
View(file_6)
glimpse(file_6)
# fix the seed so the sampled descriptions are reproducible
set.seed(123)
file_1 %>% select(description) %>% sample_n(5)
## # A tibble: 5 x 1
## description
## <chr>
## 1 The bedroom has 2 single beds, which can put put together to form a double be~
## 2 Spacious Victorian terraced house with free on street parking situated close ~
## 3 My place is close to restaurants/cafes, Bristol University, St Michaels hospi~
## 4 This beautifully presented one bedroom ground floor flat is perfectly located~
## 5 A self contained annex walking distance to the now famous and vibrant Glouces~
From the samples above, we can clearly see that we need to remove some
markup-language artifacts, e.g. HTML tags such as <br/> and bullet characters such as •.
Next, let’s explore common descriptions.
# Explore the most common listing descriptions together with their
# share of all listings (NA = listings with no description).
file_1 %>%
  group_by(description) %>%
  summarize(n_description = n()) %>%
  mutate(pct = n_description / sum(n_description)) %>%
  arrange(desc(n_description)) %>%
  top_n(10, n_description)
## # A tibble: 15 x 3
## description n_description pct
## <chr> <int> <dbl>
## 1 <NA> 28 0.0181
## 2 "Brooks Guest House Bristol is a boutique guesthouse, ~ 10 0.00648
## 3 "centre of Bristol and less than 10 minutes’ walk fro~ 6 0.00389
## 4 "In one of the city centre's prime locations right on ~ 6 0.00389
## 5 "This city building enjoys a prime location on Welsh B~ 6 0.00389
## 6 "[\"If you're looking for a modern, stylish studio liv~ 5 0.00324
## 7 "The COVID19 pandemic has made us all understand the i~ 4 0.00259
## 8 "This stylish studio apartment situated on the first ~ 4 0.00259
## 9 "<U+25CF>Spa:Bristol Marriott Royal Hotel<br /><br /><U+25CF>Lodging~ 3 0.00194
## 10 "A studio flat close to public transport, Bristol Univ~ 3 0.00194
## 11 "Offering a fabulous setting in the heart of Bristol, ~ 3 0.00194
## 12 "Our amazing and modern apartments is the perfect plac~ 3 0.00194
## 13 "Rowan Tree is a selection of boutique Serviced Apartm~ 3 0.00194
## 14 "The COVID19 pandemic has made us all understand the i~ 3 0.00194
## 15 "This stylish studio apartment situated on the second~ 3 0.00194
# fix the seed, then sample five neighbourhood overviews for inspection
set.seed(1234)
file_1 %>% select(neighborhood_overview) %>% sample_n(5)
## # A tibble: 5 x 1
## neighborhood_overview
## <chr>
## 1 North Street is a very vibrant location with plenty of bars, restaurants and ~
## 2 SACO Bristol – West India House is a Grade II listed Edwardian building locat~
## 3 Clifton Short Lets offers a variety of rooms within shared properties in the ~
## 4 Elegant, leafy and filled with chic boutiques, cosy cafes, beautiful building~
## 5 On your doorstep you have Bristol's shopping outlet. Cabot Circus is a unique~
# frequency and share of each neighbourhood overview; the top hit is
# NA (listings with no overview text)
file_1 %>%
group_by(neighborhood_overview) %>%
summarize(n_overview =n()) %>%
mutate(pct=n_overview /sum(n_overview)) %>%
arrange(-n_overview) %>%
top_n(3,n_overview)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 3
## neighborhood_overview n_overview pct
## <chr> <int> <dbl>
## 1 <NA> 418 0.271
## 2 Our apartment is situated just couple of minutes’ walk fro~ 13 0.00842
## 3 This is in the great neighbourhood of Easton, with local p~ 12 0.00777
# fix the seed, then sample five review comments from the reviews table
set.seed(1234)
file_3 %>% select(comments) %>% sample_n(5)
## # A tibble: 5 x 1
## comments
## <chr>
## 1 Nice and clean apartment. Very good communication.
## 2 Second time of staying here. Excellent communication as always and annexe beu~
## 3 Location of apartment was so central and easy to get to everything and Lee , ~
## 4 This apartment is very stylish, attractive and well appointed, in a wonderful~
## 5 Very clean and comfortable space with large garden and lovely touches like sn~
# most frequent literal review comments and their share of all reviews
file_3 %>%
group_by(comments) %>%
summarize(n_overview =n()) %>%
mutate(pct=n_overview /sum(n_overview)) %>%
arrange(-n_overview) %>%
top_n(3,n_overview)
## # A tibble: 3 x 3
## comments n_overview pct
## <chr> <int> <dbl>
## 1 . 72 0.00105
## 2 Great location 51 0.000742
## 3 Great place 47 0.000684
# rename: the review id would otherwise clash with the listing id
# during the join below
file_3 = file_3 %>% rename(unique_id = id)
# joining reviews with listing table using left join
# (one row per review, listing attributes repeated on each)
file_3 %>%
left_join(file_1, by = c("listing_id" = "id" )) -> df_1
# clean up the memory from the other files
# rm(list = intersect(..., ls())) skips names that were never created:
# the original rm(file_2) raised an "object not found" warning
rm(list = intersect(c("file", "file_1", "file_2"), ls()))
# remove the remaining per-file tables; guard with intersect() so
# objects that do not exist (file_4..file_6 in the rendered session)
# no longer trigger "object not found" warnings
rm(list = intersect(c("file_3", "file_4", "file_5", "file_6"), ls()))
# detect the language of each text field with TWO independent
# detectors (cld3 and textcat); rows are later kept only when both
# agree the text is English
df_1$lang_desc = cld3::detect_language(df_1$description)
df_1$lang_nb = cld3::detect_language(df_1$neighborhood_overview)
df_1$lang_rv = cld3::detect_language(df_1$comments)
df_1$lang_desc1 = textcat(df_1$description)
df_1$lang_nb1 = textcat(df_1$neighborhood_overview)
df_1$lang_rv1 = textcat(df_1$comments)
# explore the efficiency of detecting languages
df_1 %>% count(lang_rv, lang_rv1) %>% filter(lang_rv == 'en') # we can still see non-english
# select relevant variables
# NOTE(review): the '2020-01-03' cutoff keeps only pre-2020 reviews
# even though the snapshot is 2021-03-20 -- confirm this is intended
df_full = df_1 %>%
filter(date < '2020-01-03' & lang_desc == 'en' & lang_nb == 'en' & lang_rv == 'en' & lang_desc1 == 'english' & lang_nb1 == 'english' & lang_rv1 == 'english') %>%
select(listing_id:comments, name, description, neighborhood_overview, neighbourhood_cleansed, host_id, accommodates, host_since, host_response_rate, host_response_time, host_acceptance_rate, host_is_superhost, host_total_listings_count, host_has_profile_pic, host_identity_verified, property_type, room_type, bathrooms_text, bathrooms, beds, bedrooms, amenities, price, number_of_reviews, number_of_reviews_ltm, number_of_reviews_l30d, reviews_per_month, starts_with("review_scores_"), instant_bookable)
# let's create a column using the number of characters
df_full$review_length_chars <- nchar(df_full$comments)
# statistical summary
summary(df_full$review_length_chars)
# identifying outliers using IQR
ggplot(df_full, aes(x=review_length_chars)) + geom_histogram() + labs(x="Review character length", y="Frequency", subtitle="Distribution of Review Character Length")
# boxplot.stats flags values beyond 1.5 * IQR as outliers
outliers1 <- boxplot(df_full$review_length_chars, ylab = "Review Length")$out
# drop the rows containing outliers
# bug fix: the original negative subscript df_full[-c(which(...)), ]
# drops EVERY row when there are no outliers, because
# df[-integer(0), ] selects zero rows; logical negation is safe in
# both cases
df_full <- df_full[!df_full$review_length_chars %in% outliers1, ]
# plot a boxplot without outliers
boxplot(df_full$review_length_chars, ylab = "Review Length")
# plot a histogram
hist(df_full$review_length_chars,breaks = 100,main = "Review Length(All)")
# plot a histogram
# note: 'total' here is the number of REVIEWS per listing, despite the
# axis label
df_full %>% group_by(listing_id) %>%
summarize(total = n()) %>% ggplot(aes(x=total)) + geom_histogram() + labs(x="Total number of listings", y="Frequency", subtitle="Distribution of The Number of Listings")
# create unique id
df_full <- df_full %>% mutate(rev_id = row_number())
# check null values: columns containing at least one NA, with counts
NAcols <- which(colSums(is.na(df_full)) > 0)
sort(colSums(sapply(df_full[NAcols], is.na)), decreasing = TRUE)
length(NAcols)
# check if null values are from the same ids
na_inspect = df_full[is.na(df_full$host_since),]
df_full %>% filter(listing_id %in% unlist(na_inspect %>% select(listing_id)))
# before removing null values, replace literal "N/A" strings with real NA
# - na_if() only matches whole values; the original
#   str_replace(x, "N/A", NA_character_) turned ANY string merely
#   containing "N/A" into NA
# - seq_len() is safe even for a zero-column frame (1:ncol would
#   iterate over c(1, 0))
for (i in seq_len(ncol(df_full))) {
  if (is.character(df_full[[i]])) {
    df_full[[i]] <- na_if(df_full[[i]], "N/A")
  }
}
# read the data
# NOTE(review): this reloads df_full from disk, discarding the
# in-memory cleaning above -- df_full.rds is only saved further below,
# so this presumably relies on a previous run; confirm the ordering
df_full = readRDS("df_full.rds")
# check character columns
df_full %>% select_if(is.character)
## # A tibble: 38,869 x 14
## reviewer_name comments name description neighborhood_ov~ neighbourhood_c~
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Neralee "We lov~ City~ A good siz~ The neighbourho~ Windmill Hill
## 2 Terry "Sarah ~ City~ A good siz~ The neighbourho~ Windmill Hill
## 3 Alice "Exactl~ City~ A good siz~ The neighbourho~ Windmill Hill
## 4 Thomas "Sarah ~ City~ A good siz~ The neighbourho~ Windmill Hill
## 5 Dimitrije "The ex~ City~ A good siz~ The neighbourho~ Windmill Hill
## 6 Thomas "Sarah ~ City~ A good siz~ The neighbourho~ Windmill Hill
## 7 Eva "Sarah ~ City~ A good siz~ The neighbourho~ Windmill Hill
## 8 Eleanor "The ho~ City~ A good siz~ The neighbourho~ Windmill Hill
## 9 Liz "Sarah ~ City~ A good siz~ The neighbourho~ Windmill Hill
## 10 Clara "I spen~ City~ A good siz~ The neighbourho~ Windmill Hill
## # ... with 38,859 more rows, and 8 more variables: host_response_rate <chr>,
## # host_response_time <chr>, host_acceptance_rate <chr>, property_type <chr>,
## # room_type <chr>, bathrooms_text <chr>, amenities <chr>, price <chr>
# remove unrelated variables and drop columns with null values
# (drop_na removes ROWS with NA in these three fields, not columns;
# the *_senti copies preserve the raw text for sentiment analysis
# before the originals are lower-cased below)
df_2 = df_full %>%
select(-c("bathrooms", "reviewer_name")) %>%
drop_na(c("neighbourhood_cleansed", "description", "comments")) %>%
mutate(comments_senti = comments, description_senti = description)
# check null values: indices of columns that still contain NA
which(colSums(is.na(df_2)) > 0)
## host_since host_response_rate
## 12 13
## host_response_time host_acceptance_rate
## 14 15
## host_is_superhost host_total_listings_count
## 16 17
## host_has_profile_pic host_identity_verified
## 18 19
## bedrooms review_scores_rating
## 24 31
## review_scores_accuracy review_scores_cleanliness
## 32 33
## review_scores_checkin review_scores_communication
## 34 35
## review_scores_location review_scores_value
## 36 37
# explore the data: column types and first values of the working table
glimpse(df_2)
## Rows: 38,869
## Columns: 42
## $ listing_id <dbl> 70820, 70820, 70820, 70820, 70820, 7082...
## $ unique_id <dbl> 8163487, 8397676, 8706695, 13024371, 14...
## $ date <date> 2013-10-18, 2013-10-28, 2013-11-12, 20...
## $ reviewer_id <dbl> 7538116, 9601180, 9697980, 2388955, 163...
## $ comments <chr> "We loved our stay with Sarah in Bristo...
## $ name <chr> "City View - Sarah's double room.", "Ci...
## $ description <chr> "A good sized room with a comfy double ...
## $ neighborhood_overview <chr> "The neighbourhood is friendly and welc...
## $ neighbourhood_cleansed <chr> "Windmill Hill", "Windmill Hill", "Wind...
## $ host_id <dbl> 360195, 360195, 360195, 360195, 360195,...
## $ accommodates <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ host_since <date> 2011-01-24, 2011-01-24, 2011-01-24, 20...
## $ host_response_rate <chr> "100%", "100%", "100%", "100%", "100%",...
## $ host_response_time <chr> "within a day", "within a day", "within...
## $ host_acceptance_rate <chr> "75%", "75%", "75%", "75%", "75%", "75%...
## $ host_is_superhost <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU...
## $ host_total_listings_count <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU...
## $ host_identity_verified <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU...
## $ property_type <chr> "Private room in townhouse", "Private r...
## $ room_type <chr> "Private room", "Private room", "Privat...
## $ bathrooms_text <chr> "1 shared bath", "1 shared bath", "1 sh...
## $ beds <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ bedrooms <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ amenities <chr> "[\"Hangers\", \"Free street parking\",...
## $ price <chr> "$30.00", "$30.00", "$30.00", "$30.00",...
## $ number_of_reviews <dbl> 153, 153, 153, 153, 153, 153, 153, 153,...
## $ number_of_reviews_ltm <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...
## $ number_of_reviews_l30d <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ reviews_per_month <dbl> 1.69, 1.69, 1.69, 1.69, 1.69, 1.69, 1.6...
## $ review_scores_rating <dbl> 95, 95, 95, 95, 95, 95, 95, 95, 95, 95,...
## $ review_scores_accuracy <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...
## $ review_scores_cleanliness <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...
## $ review_scores_checkin <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...
## $ review_scores_communication <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...
## $ review_scores_location <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,...
## $ review_scores_value <dbl> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...
## $ instant_bookable <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ review_length_chars <int> 415, 163, 170, 315, 574, 123, 291, 315,...
## $ rev_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ...
## $ comments_senti <chr> "We loved our stay with Sarah in Bristo...
## $ description_senti <chr> "A good sized room with a comfy double ...
# Normalise the free-text columns to lower case (the *_senti copies
# created earlier keep their original casing).
for (txt_col in c("neighbourhood_cleansed", "description", "comments")) {
  df_2[[txt_col]] <- tolower(df_2[[txt_col]])
}
# create a function extracting only numbers
# Returns the first (optionally signed) decimal number in each string,
# e.g. "$30.00" -> "30.00"; NA when no digits are present.
# The original pattern "\\-*\\d+\\.*\\d*" tolerated repeated signs and
# dots ("--3", "3..5"); "-?\\d+\\.?\\d*" matches the same well-formed
# numbers strictly.
numextract <- function(string){
  str_extract(string, "-?\\d+\\.?\\d*")
}
# set a cut-off date (used below to compute host tenure in years)
df_2$enddate = as.Date("2020-02-29")
# plot a histogram
hist(df_2$review_scores_rating)
# feature engineering
# - rating_group: 6 score bands; ops_year: host tenure in years at the
#   cut-off date; amen_items: amenity count; *_rate: "95%" -> 95
# - bug fix: the first response_time level was misspelled
#   "wihtin an hour", so every "within an hour" host was silently
#   converted to NA by factor()
df_2 %>% mutate(rating_group = as.numeric(cut(df_2$review_scores_rating, breaks = c(0,75,80,85,90,95,100))),
                ops_year = as.numeric(((enddate - host_since) / 365)),
                amen_items = (str_count(amenities, pattern = ",") + 1),
                price_num = as.numeric(numextract(price)),
                response_rate = as.numeric(str_replace(host_response_rate, "%", "")),
                response_time = factor(host_response_time, levels = c("within an hour", "within a few hours", "within a day", "a few days or more")),
                acceptance_rate = as.numeric(str_replace(host_acceptance_rate, "%", "")),
                host_is_superhost = as.factor(host_is_superhost),
                host_has_profile_pic = as.factor(host_has_profile_pic),
                host_identity_verified = as.factor(host_identity_verified),
                bathrooms = as.numeric(numextract(bathrooms_text)),
                instant_bookable = as.factor(instant_bookable),
                neighbourhood = as.factor(neighbourhood_cleansed),
                year = strftime(date, format = "%Y"),
                month = strftime(date, format = "%m"),
                day = strftime(date, format = "%d")) %>%
  # drop the raw columns that were just re-encoded
  select(-c(host_since, amenities, price, host_response_rate,
            host_response_time,host_acceptance_rate, bathrooms_text,
            neighbourhood_cleansed)) -> df
# check correlation between price and the number of tenants
# (complete.obs drops rows where either value is NA)
cor(df$price_num, df$accommodates, method = "pearson", use = "complete.obs")
## [1] 0.7493011
# check the average for the number of tenants
# (style note: prefer TRUE over T, which is reassignable)
mean(df$accommodates, na.rm = T)
## [1] 3.035941
# let's create a new variable using adjustment concept for price:
# price per person, rescaled by the mean occupancy so its magnitude
# stays comparable to the raw price
df = df %>% mutate(price_adj = (price_num / accommodates) * mean(df$accommodates, na.rm = T))
# plot a histogram
hist(df$price_adj)
# check correlation again (should now be much weaker)
cor(df$price_adj, df$accommodates, method = "pearson", use = "complete.obs")
## [1] -0.1307732
# visualize scatter plot to inspect the relationship
df %>% ggplot(aes(accommodates, price_adj)) + geom_point() + geom_smooth(method="lm") + labs(subtitle="Adjusted Price per Person by The Number of Tenants", x="Number of tenants", y="Adjusted price per person")
## `geom_smooth()` using formula 'y ~ x'
# create a function for text cleaning
# Strips punctuation and collapses all whitespace to single spaces.
# Bug fix: line breaks are converted to spaces BEFORE collapsing; the
# original deleted "\r"/"\n" after the punctuation pass, which fused
# the words on either side of a line break ("end\nstart" -> "endstart").
# (Parameter renamed from `c`, which shadowed base::c.)
cleanse_text <- function(txt) {
  txt <- iconv(txt)                            # normalise encoding
  txt <- gsub("[\r\n]+", " ", txt)             # line breaks -> spaces
  txt <- gsub("[[:punct:][:blank:]]+", " ", txt)
  trimws(txt)
}
# create a function for text cleaning used for part B: sentiment analysis
# Keeps punctuation (needed by the sentiment lexicons) but normalises
# whitespace. Bug fix: line breaks become spaces BEFORE collapsing
# blanks; the original deleted "\r"/"\n" outright, fusing the words on
# either side of a line break.
cleanse_text_withpunc <- function(txt) {
  txt <- iconv(txt)
  txt <- gsub("[\r\n]+", " ", txt)
  txt <- gsub("[[:blank:]]+", " ", txt)
  trimws(txt)
}
# cleanse texts: *_ori columns keep punctuation for sentiment analysis
# NOTE(review): assigning df_2 columns into df relies on both frames
# having identical row order -- confirm nothing reordered either
df$comments = cleanse_text(df_2$comments)
df$description = cleanse_text(df_2$description)
df$comments_ori = cleanse_text_withpunc(df_2$comments_senti)
df$description_ori = cleanse_text_withpunc(df_2$description_senti)
# inspect comments (seed fixed so both samples pick the same rows)
set.seed(1234)
df %>% select(comments) %>% sample_n(10) %>% pull()
# inspect comments with punctuation marks
set.seed(1234)
df %>% select(comments_ori) %>% sample_n(10) %>% pull()
# free the intermediate frames and checkpoint the working data to disk
rm(df_1)
rm(df_2)
saveRDS(df, file = "df.rds")
saveRDS(df_full, file = "df_full.rds")
rm(df_full)
# download and load the English udpipe model, then POS-tag every review
langmodel_download <- udpipe::udpipe_download_model("english")
langmodel <- udpipe::udpipe_load_model(langmodel_download$file_model)
postagged <- udpipe_annotate(langmodel,
df$comments,
parallel.cores = 8,
trace = 5000)
postagged <- as.data.frame(postagged)
head(postagged)
# keep only nouns, adjectives and adverbs, and collapse the lemmas of
# each review back into a single space-separated string per doc_id
lematized <- postagged %>% filter(upos %in% c("NOUN",
"ADJ",
"ADV")) %>% select(doc_id,lemma) %>% group_by(doc_id) %>% summarise(documents_pos_tagged = paste(lemma,collapse = " "))
# create unique id matching the doc_id produced by udpipe_annotate
df2 <- df %>% mutate(doc_id = paste0("doc",row_number()))
# combine tables (joins implicitly on doc_id)
df2 <- df2 %>% left_join(lematized)
# save the working file as .rds
saveRDS(df2,file = "df2.rds")
# one row per (review, token)
token_df_all <- df2 %>% unnest_tokens(word,documents_pos_tagged)
# check the language again and remove non-English words
token_df_all$lan <- cld3::detect_language(token_df_all$word)
token_df_all <- token_df_all %>% filter(lan=="en")
# keep only tokens that pass an English spell check
spell_check = hunspell_check(token_df_all$word)
token_df_all = token_df_all[spell_check,]
#saveRDS(token_df,file = "token_df.rds")
saveRDS(token_df_all,file = "token_df_all.rds")
# read data
token_df_all = readRDS("token_df_all.rds")
# calculate the token length
token_df_all$token_length <- nchar(token_df_all$word)
# let's have a look on the distribution
token_df_all %>% group_by(token_length) %>% summarise(total =n()) %>% ggplot(aes(x=token_length, y=total)) + geom_col() + labs(x="Token length", y="Frequency", subtitle="Distribution of Token Length")
## `summarise()` ungrouping output (override with `.groups` argument)
# let's have a look at the distribution of tokens again, as a table
# sorted from the longest token down
token_df_all %>% group_by(token_length) %>%
summarise(total =n()) %>%
arrange(desc(token_length))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 14 x 2
## token_length total
## <int> <int>
## 1 15 7
## 2 14 352
## 3 13 295
## 4 12 477
## 5 11 668
## 6 10 3638
## 7 9 6193
## 8 8 2296
## 9 7 8803
## 10 6 9405
## 11 5 33111
## 12 4 27015
## 13 3 9195
## 14 2 931
- We could remove tokens whose length is too long. - From the table above, the length distribution is right-skewed, and only a few words are longer than 14 characters. - However, these may still be useful for the analysis, so we decide to keep them.
# most frequent tokens across all reviews (before stop-word removal)
token_df_all %>% count(word, sort=T)
## # A tibble: 833 x 2
## word n
## <chr> <int>
## 1 great 20690
## 2 stay 8803
## 3 bed 3969
## 4 highly 3809
## 5 space 3369
## 6 time 3277
## 7 here 2809
## 8 just 2617
## 9 welcoming 2545
## 10 welcome 2509
## # ... with 823 more rows
# remove null values in words
token_df_all = token_df_all[!is.na(token_df_all$word),]
# load stop words (tidytext's built-in lexicon)
data("stop_words")
token_df2 = token_df_all %>% anti_join(stop_words, by = "word")
# retrieve host name from available data, to treat host first names as
# stop words (presumably file[[4]] is the listings table -- confirm)
file = lapply(myfiles, read_csv)
hostname = file[[4]]
# keep only single-word host names; multi-word entries are skipped
hostname<-hostname %>%
filter(str_count(host_name,boundary("word"))==1 ) %>%
unique() %>%
as_tibble()
hostname_vector = as_tibble(unique(hostname$host_name))
# rename a column name so anti_join can match on "word"
colnames(hostname_vector) = "word"
# generate new stop words: city / neighbourhood names and common hosts
mystopwords <- tibble(word =c("bristol", "city", "stay", "airbnb", "clifton", "windmill", "easton", "redland", "southmead", "hotwells", "harbourside","ashley", "henbury", "george", "southville","eastville", "bishopston", "knowle", "lockleaze", "filwood", "cotham", "henleaze", "bishopsworth","sarah"))
# combine stop words
mystopwords<-mystopwords %>% bind_rows(hostname_vector)
mystopwords$word<-tolower(mystopwords$word)
# combine data sets: drop the custom stop words from the tokens
token_df2 = token_df2 %>% anti_join(mystopwords, by = "word")
# explore top words across corpus
token_df2 %>% group_by(word) %>%
summarise(total =n()) %>%
arrange(desc(total)) %>%
top_n(10)
## `summarise()` ungrouping output (override with `.groups` argument)
## Selecting by total
## # A tibble: 10 x 2
## word total
## <chr> <int>
## 1 bed 3969
## 2 highly 3809
## 3 space 3369
## 4 time 3277
## 5 welcoming 2545
## 6 minute 1846
## 7 touch 1701
## 8 day 1605
## 9 central 1462
## 10 town 1447
# save file
saveRDS(token_df2,file = "token_df2.rds")
# word frequency per listing, plus total words per listing
listing_words = token_df2 %>% count(listing_id, word, sort=T)
total_words <- listing_words %>%
group_by(listing_id) %>%
summarize(total = sum(n))
## `summarise()` ungrouping output (override with `.groups` argument)
# attach each listing's word total (joins implicitly on listing_id)
listing_words <- listing_words %>%
left_join(total_words)
## Joining, by = "listing_id"
# tf-idf per (listing, word); the running total column is no longer needed
listing_tf_idf <- listing_words %>% bind_tf_idf(word,listing_id,n) %>% select(-total)
# plot a histogram
hist(listing_tf_idf$tf_idf,breaks = 80,main="TF-IDF plot", xlim = c(0,1))
# progressively trim extreme tf-idf values (dominated by one-off words)
listing_tf_idf <- listing_tf_idf %>%
filter(tf_idf<=0.3)
# plot a histogram
hist(listing_tf_idf$tf_idf,breaks = 80,main="TF-IDF plot", xlim = c(0,0.2))
listing_tf_idf_2 <- listing_tf_idf %>%
filter(tf_idf<=0.2)
# statistical summary
summary(listing_tf_idf_2$tf_idf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001185 0.013738 0.024093 0.035282 0.043620 0.199999
# let's explore the top 10 words with the highest tf-idf
# (top_n with no column named selects by the last column, tf_idf)
listing_tf_idf_2 %>%
group_by(word) %>%
arrange(desc(tf_idf)) %>%
top_n(10)
## Selecting by tf_idf
## # A tibble: 3,214 x 6
## # Groups: word [687]
## listing_id word n tf idf tf_idf
## <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 8529713 greatly 1 0.0714 2.80 0.200
## 2 33211242 mead 1 0.111 1.79 0.199
## 3 38016691 lively 1 0.1 1.99 0.199
## 4 40813734 possibly 1 0.1 1.99 0.199
## 5 37483548 feature 1 0.0833 2.39 0.199
## 6 26635769 coffees 1 0.0526 3.77 0.199
## 7 39057103 efficiently 1 0.0526 3.77 0.199
## 8 6406525 valuable 1 0.05 3.97 0.198
## 9 24217586 pleasantly 1 0.0625 3.17 0.198
## 10 34960416 smell 1 0.0667 2.97 0.198
## # ... with 3,204 more rows
# let's explore the top 10 words with the lowest tf-idf
# NOTE(review): top_n(10) still selects the LARGEST tf_idf per group
# even after arrange(tf_idf); for the lowest values top_n(-10) would
# be needed -- here most word groups have <= 10 rows so the output
# happens to show the low values, but confirm the intent
listing_tf_idf_2 %>%
group_by(word) %>%
arrange(tf_idf) %>%
top_n(10)
## Selecting by tf_idf
## # A tibble: 3,214 x 6
## # Groups: word [687]
## listing_id word n tf idf tf_idf
## <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 4086139 catering 1 0.00123 4.41 0.00544
## 2 4086139 hiccup 1 0.00123 4.41 0.00544
## 3 4086139 curry 1 0.00123 4.53 0.00559
## 4 4086139 ridiculously 1 0.00123 4.53 0.00559
## 5 4086139 exercise 1 0.00123 4.66 0.00575
## 6 4086139 independence 1 0.00123 4.66 0.00575
## 7 4086139 page 1 0.00123 4.66 0.00575
## 8 4086139 delivery 1 0.00123 4.81 0.00594
## 9 4086139 father 1 0.00123 4.81 0.00594
## 10 4086139 problematic 1 0.00123 4.81 0.00594
## # ... with 3,204 more rows
# word frequency per neighbourhood
neighbour_words <- token_df2 %>%
count(neighbourhood, word, sort = TRUE)
# word count for each neighbourhood
total_neighbour_words <- neighbour_words %>%
group_by(neighbourhood) %>%
summarize(total = sum(n))
# left join (implicitly on neighbourhood)
neighbour_words <- neighbour_words %>%
left_join(total_neighbour_words)
# visualization of the frequency (term frequency = n / total)
neighbour_words %>%
mutate(tf = n/total) %>%
ggplot(aes(x=tf,fill=neighbourhood)) +
geom_histogram(show.legend = FALSE)+
facet_wrap(~neighbourhood,ncol=10,scales = "free_y")
# rank words within each neighbourhood (rows are already frequency-
# sorted from count(..., sort = TRUE), so row_number() is the rank)
neighbour_words %>%
group_by(neighbourhood) %>%
mutate(rank = row_number(),
tf = n/total) %>%
ungroup() -> zipf_data
# plot using Zipf's law
zipf_data %>%
ggplot(aes(rank, tf, color = neighbourhood)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_x_log10() +
scale_y_log10()
# tf-idf per (neighbourhood, word)
neighbour_tf_idf <- neighbour_words %>%
bind_tf_idf(word,neighbourhood,n) %>%
select(-total)
# keep only distinctive words (common words have tf_idf near 0)
neighbour_tf_idf <- neighbour_tf_idf %>%
filter(tf_idf>0.002)
neighbour_tf_idf %>%
ggplot(aes(tf_idf)) +
geom_histogram() +
xlim(0,0.02)
# Wrap a string at roughly nwrap characters, joining the wrapped
# pieces with newlines (keeps long facet labels readable).
# Vectorize() lifts the scalar helper over character vectors.
swr <- function(string, nwrap = 20) {
  pieces <- strwrap(string, width = nwrap)
  paste(pieces, collapse = "\n")
}
swr <- Vectorize(swr)
# top 5 distinctive words per neighbourhood, faceted bar chart
neighbour_tf_idf %>%
group_by(neighbourhood) %>%
slice_max(tf_idf,n=5,with_ties = F) %>%
ungroup() %>%
mutate(neighbourhood = swr(neighbourhood), row = row_number(), word2 = fct_reorder(word, tf_idf)) -> n
n %>%
ggplot(aes(tf_idf, word2, fill = neighbourhood)) +
geom_col(show.legend = FALSE) +
facet_wrap(~neighbourhood, ncol = 5, scales = "free") +
labs(x = "tf-idf", y = "Top 5 Words")
# word frequency per calendar month of the review date
month_words <- token_df2 %>%
count(month, word, sort = TRUE)
# word count for each month
total_month_words <- month_words %>%
group_by(month) %>%
summarize(total = sum(n))
# left join (implicitly on month)
month_words <- month_words %>%
left_join(total_month_words)
# visualization of the frequency
month_words %>%
mutate(tf = n/total) %>%
ggplot(aes(x=tf,fill=month))+
geom_histogram(show.legend = FALSE)+
xlim(0,0.0009)+
facet_wrap(~month,ncol=5,scales = "free_y")
From the above plot, we see a similar pattern of words with the same tf values across the months from July to October.
Let’s apply Zipf’s law.
# rank words within each month for the Zipf plot
month_words %>%
group_by(month) %>%
mutate(rank = row_number(),
tf = n/total) %>%
ungroup() -> zipf_data
# plot (log-log: Zipf's law predicts a roughly straight line)
zipf_data %>%
ggplot(aes(rank, tf, color = month)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_x_log10() +
scale_y_log10()
From Zipf’s plot, we can see that words that are distinctive from each month are at the rank higher than 10. Hence, we will filter data to remove common or popular words.
TF-IDF by month
# tf-idf per (month, word)
month_tf_idf <- month_words %>%
bind_tf_idf(word,month,n) %>%
select(-total)
# explore the distribution (zeros excluded: words common to all months)
month_tf_idf %>%
filter(tf_idf>0) %>%
ggplot(aes(tf_idf))+geom_histogram(bins = 50)
# keep only the distinctive words
month_tf_idf <- month_tf_idf %>%
filter(tf_idf>0.0003)
month_tf_idf %>%
filter(tf_idf>0) %>%
ggplot(aes(tf_idf))+geom_histogram()
# rank the words and plot the 7 most distinctive words per month
month_tf_idf %>%
group_by(month)%>%
arrange(desc(tf_idf)) %>%
slice_max(tf_idf,n=7,with_ties = F) %>%
ungroup() %>%
ggplot(aes(tf_idf, reorder(word, tf_idf), fill = month)) +
geom_col(show.legend = FALSE) +
facet_wrap(~month, ncol = 6, scales = "free", labeller =labeller(month = c("01" = "Jan", "02" = "Feb", "03" = "Mar", "04" = "Apr", "05" = "May", "06" = "Jun", "07" = "Jul", "08" = "Aug", "09" = "Sep", "10" = "Oct", "11" = "Nov", "12" = "Dec"))) +
labs(x = "tf-idf", y = "Top 7 Words")
# explore the distribution of listing price
token_df2 %>% ggplot(aes(x=price_num)) + geom_histogram(bins = 100) + xlim(c(0,400))
summary(token_df2$price_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 38.00 53.00 71.23 80.00 843.00
# allocate the data into 7 price bands
# bug fix: the previous top break of 800 excluded the maximum observed
# price (843), silently producing an NA group -- Inf makes the last
# band open-ended so every price is assigned
token_df2$price_group <- as.numeric(cut(token_df2$price_num, breaks = c(0,30,45,60,75,90,150,Inf)))
# sanity check: mean price and row count per band
token_df2 %>% group_by(price_group) %>% summarize(mean = mean(price_num), count = n())
## # A tibble: 8 x 3
## price_group mean count
## <dbl> <dbl> <int>
## 1 1 27.1 7783
## 2 2 37.9 10114
## 3 3 51.9 9266
## 4 4 67.7 7429
## 5 5 82.9 5475
## 6 6 115. 5578
## 7 7 274. 2838
## 8 NA 843 1
# tokenize by price_group
price_group_words <- token_df2 %>%
count(price_group, word, sort = TRUE)
# word count for each price_group
total_price_group_words <- price_group_words %>%
group_by(price_group) %>%
summarize(total = sum(n))
# left join (implicitly on price_group)
price_group_words <- price_group_words %>%
left_join(total_price_group_words)
# visualization of the frequency
# (swr() on the numeric group id is a no-op wrap; kept for labelling)
price_group_words %>%
mutate(tf = n/total, price_group2 = swr(price_group)) %>%
ggplot(aes(x=tf,fill=price_group2))+
geom_histogram(show.legend = FALSE)+
facet_wrap(~price_group2,ncol=4,scales = "free_y", labeller =labeller(price_group2 = c("1" = "0-30 Dollars", "2" = "31-45 Dollars", "3" = "46-60 Dollars", "4" = "61-75 Dollars", "5" = "76-90 Dollars", "6" = "91-150 Dollars", "7" = "More than 150 Dollars")))
From the above, words follow a similar pattern, with the same tf values across price ranges.
Let’s apply Zipf’s law.
# rank words within each price band for the Zipf plot
price_group_words %>%
group_by(factor(price_group)) %>%
mutate(rank = row_number(),tf = n/total) %>%
ungroup() -> zipf_data
# plot (log-log scales)
zipf_data %>%
ggplot(aes(rank, tf, color = factor(price_group))) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_x_log10() +
scale_y_log10()
We see little deviation between groups at low ranks and greater deviation at high ranks, roughly beyond rank 110.
TF-IDF by price range
# tf-idf per (price band, word)
price_group_tf_idf <- price_group_words %>%
bind_tf_idf(word,price_group,n) %>%
select(-total)
# filter data (zeros are words shared by every band)
price_group_tf_idf %>%
filter(tf_idf>0) %>%
ggplot(aes(tf_idf))+geom_histogram()
price_group_tf_idf <-price_group_tf_idf %>% filter(tf_idf>0)
# plot a histogram
price_group_tf_idf %>%
ggplot(aes(tf_idf))+geom_histogram()
# set the axis value
# note: options(scipen) is a GLOBAL side effect; it disables
# scientific notation for the rest of the session
options(scipen=10000)
# plot the 7 most distinctive words per price band
price_group_tf_idf %>%
group_by(price_group) %>%
arrange(desc(tf_idf)) %>%
slice_max(tf_idf,n=7,with_ties = F) %>%
ungroup() %>%
mutate(price_group2 = swr(price_group)) %>%
ggplot(aes(tf_idf, reorder(word, tf_idf), fill = price_group2)) +
geom_col(show.legend = FALSE) +
facet_wrap(~price_group2, ncol = 4, scales = "free", labeller =labeller(price_group2 = c("1" = "0-30 Dollars", "2" = "31-45 Dollars", "3" = "46-60 Dollars", "4" = "61-75 Dollars", "5" = "76-90 Dollars", "6" = "91-150 Dollars", "7" = "More than 150 Dollars"))) +
labs(x = "tf-idf", y = "Top 7 Words") + theme(strip.background = element_blank(), strip.placement = "outside")
# explore the distribution of review rating score
hist(token_df2$review_scores_rating)
# grouping the data into 6 groups (same bands as the earlier
# rating_group feature; recomputed here on the token table)
token_df2$rating_group <- as.numeric(cut(token_df2$review_scores_rating, breaks = c(0,75,80,85,90,95,100)))
token_df2 = token_df2 %>% filter(!is.na(rating_group))
rating_group_words <- token_df2 %>%
count(rating_group, word, sort = TRUE)
# word count for each rating_group
total_rating_group_words <- rating_group_words %>%
group_by(rating_group) %>%
dplyr::summarize(total = sum(n))
# left join (implicitly on rating_group)
rating_group_words <- rating_group_words %>%
left_join(total_rating_group_words)
# visualization of the frequency
rating_group_words %>%
mutate(tf = n/total) %>%
ggplot(aes(x=tf,fill=rating_group))+
geom_histogram(show.legend = FALSE)+
facet_wrap(~rating_group,ncol=3,scales = "free_y")
From the plot above, only the groups with scores higher than 85 have a high frequency of unique/rare words across listings.
Let’s apply Zipf’s law
# rank words within each rating band for the Zipf plot
rating_group_words %>%
group_by(rating_group) %>%
mutate(rank = row_number(),tf = n/total, rating_group = factor(rating_group)) %>%
ungroup() -> zipf_data
# plot (log-log, one colour per rating band)
zipf_data %>%
ggplot(aes(rank, tf, color = rating_group)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_color_manual(name="Rating Group",
labels=c("0-75", "76-80", "81-85", "86-90","91-95","96-100"),
values=c("red","green","blue", "black","purple","orange")) +
scale_x_log10() +
scale_y_log10() +
xlab("Rank") +
ylab("tf")
From the plot above, the deviation of words starts at the low rank.
TF-IDF by rating groups
# tf-idf per (rating band, word)
rating_group_tf_idf <- rating_group_words %>%
bind_tf_idf(word,rating_group,n) %>%
select(-total) %>%
arrange(desc(tf_idf))
# remove outliers / common words before plotting
rating_group_tf_idf %>%
filter(tf_idf>0) %>%
ggplot(aes(tf_idf))+geom_histogram() + xlim(c(0,0.05))
rating_group_tf_idf <-rating_group_tf_idf %>%
filter(tf_idf>0.002)
# top 7 distinctive words per rating band
rating_group_tf_idf %>%
group_by(rating_group)%>%
arrange(desc(tf_idf)) %>%
slice_max(tf_idf,n=7,with_ties = F) %>%
ungroup() %>%
mutate(rating_group2 = swr(rating_group)) %>%
ggplot(aes(tf_idf, word, fill = rating_group2)) +
geom_col(show.legend = FALSE) +
facet_wrap(~rating_group2, ncol = 3, scales = "free", labeller = labeller(rating_group2 = c("1" = "Score 0-75","2" = "Score 76-80", "3"="Score 81-85", "4"="Score 86-90", "5" = "Score 91-95", "6" = "Score 96-100"))) +
labs(x = "tf-idf", y = "Top 7 Words") + theme(strip.background = element_blank(), strip.placement = "outside")
# word frequency per room type
roomtype_words <- token_df2 %>%
count(room_type, word, sort = TRUE)
# word count for each room type
total_roomtype_words <- roomtype_words %>%
group_by(room_type) %>%
summarize(total = sum(n))
# left join (implicitly on room_type)
roomtype_words <- roomtype_words %>%
left_join(total_roomtype_words)
# visualization of the frequency
roomtype_words %>%
mutate(tf = n/total) %>%
ggplot(aes(x=tf,fill=room_type))+
geom_histogram(show.legend = FALSE)+
xlim(0,0.2)+
facet_wrap(~room_type,ncol=2,scales = "free_y")
roomtype_words %>%
group_by(room_type) %>%
mutate(rank = row_number(),tf = n/total) %>%
ungroup() -> zipf_data
# plot
zipf_data %>%
ggplot(aes(rank, tf, color = room_type)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) +
scale_x_log10() +
scale_y_log10()
From the above, the shared room type contains fewer words.
In addition, we observe deviation of words of hotel room type starting at words with the rank 80-100 while the deviation between entire home and private room types starts at the rank higher than 140.
TF-IDF by room types
# tf-idf per (word, room type)
roomtype_tf_idf <- roomtype_words %>%
bind_tf_idf(word,room_type,n) %>%
select(-total)
# explore the distribution of non-zero tf-idf values
roomtype_tf_idf %>%
filter(tf_idf>0) %>%
ggplot(aes(tf_idf))+geom_histogram()
From the plot above, common words dominate the low end of the tf-idf distribution, so we filter them out.
Filter data
# keep only distinctive words (tf-idf above 0.001, per the histogram above)
roomtype_tf_idf <-roomtype_tf_idf %>%
filter(tf_idf>0.001)
# re-check the distribution after filtering
roomtype_tf_idf %>%
filter(tf_idf>0) %>%
ggplot(aes(tf_idf))+geom_histogram()
# top-7 distinctive words per room type
roomtype_tf_idf %>%
group_by(room_type)%>%
arrange(desc(tf_idf)) %>%
slice_max(tf_idf,n=7,with_ties = F) %>%
ungroup() %>%
ggplot(aes(tf_idf, reorder(word, tf_idf), fill = room_type)) +
geom_col(show.legend = FALSE) +
facet_wrap(~room_type, ncol = 6, scales = "free") +
labs(x = "tf-idf", y = "Top 7 Words")
# read the prepared data set from disk
df2 = readRDS("df2.rds")
# one row per listing with the columns needed for the description analysis
listings.description <- df2 %>%
distinct(listing_id,description,neighbourhood,review_scores_rating,price_num,property_type, host_id,host_is_superhost, rating_group)
# tokenize descriptions: one row per (listing, word)
listings.description %>% unnest_tokens(word,description) -> listings.description.working
# load the standard tidytext stop-word lexicon
data("stop_words")
# custom stop words maintained in a csv — assumed to contain a 'word' column
# so anti_join below matches on it (TODO confirm the csv schema)
custom_stopwords <- read.csv("customStopWord.csv", header = TRUE)
# drop standard and custom stop words, then strip tokens down to letters
listings.description.working <- listings.description.working %>%
anti_join(stop_words) %>% anti_join(custom_stopwords) %>%
# str_extract keeps only the leading run of lowercase letters/apostrophes;
# tokens with no such run become NA and are dropped next
mutate(word = str_extract(word, "[a-z']+")) %>%
filter(is.na(word)==FALSE)
# language detection per token via cld3; keep English tokens only
listings.description.working$lan <- cld3::detect_language(listings.description.working$word)
listings.description.working <- listings.description.working %>% filter(lan=="en")
# second pass with hunspell: keep only tokens that pass the spell check
spell_check = hunspell_check(listings.description.working$word)
listings.description.working = listings.description.working[spell_check,]
# top-20 most frequent words across all descriptions
listings.description.working %>%
count(word) %>%
mutate(word=reorder(word,n)) %>%
slice_max(n,n=20) %>%
ggplot(aes(word,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
coord_flip()
From the above, we can see that the top 3 most frequent words are 'space', 'bed', and 'equipped'. This potentially indicates the room characteristics that guests are looking for, which hosts emphasise in their listing descriptions.
Let’s see the top 10 words in description w.r.t. review rating score
# Top-10 most frequent words per rating group.
# The six plots below shared an identical pipeline, so it is factored into a
# single helper; the original object names are kept so the grid.arrange()
# call further down continues to work unchanged.
#
# group_id:   integer code of the rating group (1 = scores 0-75, ..., 6 = 96-100)
# plot_title: title displayed on the plot
# returns:    a ggplot bar chart of the 10 most frequent words
#             (slice_max keeps ties by default, matching the original code)
plot_top_words_by_rating <- function(group_id, plot_title) {
  listings.description.working %>%
    filter(rating_group == group_id) %>%
    count(word) %>%
    mutate(word = reorder(word, n)) %>%
    slice_max(n, n = 10) %>%
    ggplot(aes(word, n)) +
    geom_bar(stat = "identity") +
    xlab(NULL) +
    labs(title = plot_title) +
    coord_flip()
}
# most common words used in each rating group
rating4.maxwords <- plot_top_words_by_rating(1, "Review Score: 0-75")
rating6.maxwords <- plot_top_words_by_rating(2, "Review Score: 76-80")
rating7.maxwords <- plot_top_words_by_rating(3, "Review Score: 81-85")
rating8.maxwords <- plot_top_words_by_rating(4, "Review Score: 86-90")
rating9.maxwords <- plot_top_words_by_rating(5, "Review Score: 91-95")
rating10.maxwords <- plot_top_words_by_rating(6, "Review Score: 96-100")
# combine the six per-group plots into one grob and display in a 2-column grid
maxwords.plots = arrangeGrob(rating4.maxwords,rating6.maxwords, rating7.maxwords, rating8.maxwords, rating9.maxwords, rating10.maxwords)
grid.arrange(maxwords.plots, ncol = 2, widths = c(3/4,1/4))
# build one top-10-words plot per neighbourhood
# u1 holds sanitized neighbourhood names (used to build object names),
# u2 holds the original names (used to filter the data and title the plot)
u1 <- lapply(listings.description.working %>% distinct(neighbourhood),as.character)
u2 <- lapply(listings.description.working %>% distinct(neighbourhood),as.character)
u7 <- ""
rating.neighbourhood <- 0
# NOTE(review): assign() with pasted names is fragile; the generated objects
# (rating.neighbourhood.<name>) are referenced explicitly below, so the naming
# scheme here must stay in sync with that list
for(i in 1:length(u1[[1]])) {
# strip characters that are not valid in R object names
u1[[1]][i] <- str_replace_all(u1[[1]][i]," ","")
u1[[1]][i] <- str_replace_all(u1[[1]][i],"&","")
u1[[1]][i] <- str_replace_all(u1[[1]][i],"-","")
assign( paste0("rating.neighbourhood.",u1[[1]][i]) , listings.description.working %>%
filter(neighbourhood==u2[[1]][i]) %>%
count(word) %>%
mutate(word=reorder(word,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(word,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = paste0("Neighbourhood = ", u2[[1]][i]))+
coord_flip())
# u7 accumulates a comma-separated list of the generated object names
u7 <- paste(u7,paste0("rating.neighbourhood.",u1[[1]][i]),sep=",")
}
# print each per-neighbourhood plot (objects generated by the assign() loop
# above; each expression renders one plot in the knitted output)
rating.neighbourhood.windmillhill
rating.neighbourhood.clifton
rating.neighbourhood.bedminster
rating.neighbourhood.easton
rating.neighbourhood.ashley
rating.neighbourhood.brislingtonwest
rating.neighbourhood.redland
rating.neighbourhood.knowle
rating.neighbourhood.lawrencehill
rating.neighbourhood.stokebishop
rating.neighbourhood.hotwellsharbourside
rating.neighbourhood.henburybrentry
rating.neighbourhood.cotham
rating.neighbourhood.southmead
rating.neighbourhood.cliftondown
rating.neighbourhood.southville
rating.neighbourhood.avonmouthlawrenceweston
rating.neighbourhood.eastville
rating.neighbourhood.brislingtoneast
rating.neighbourhood.central
rating.neighbourhood.bishopstonashleydown
rating.neighbourhood.stgeorgewest
rating.neighbourhood.horfield
rating.neighbourhood.westburyontrymhenleaze
rating.neighbourhood.lockleaze
rating.neighbourhood.hillfields
rating.neighbourhood.bishopsworth
rating.neighbourhood.fromevale
rating.neighbourhood.stgeorgecentral
rating.neighbourhood.hengrovewhitchurchpark
rating.neighbourhood.filwood
rating.neighbourhood.hartcliffewithywood
rating.neighbourhood.stgeorgetroopershill
Considering the top 3 neighbourhoods with the most reviews — Ashley, Central, and Clifton — the common words are ‘space’, ‘bed’, and ‘stay’.
‘relax’ is the word that distinguishes Clifton from the others, while no comparably distinctive word stands out in Ashley or Central.
Let’s explore the top words used by hosts with multiple listings.
# inspect listings per host: add a per-host 'total' column, then print the
# data-frame summary (see the 'total' column in the output below)
listings.description %>% group_by(host_id) %>% mutate(total = n()) %>% summary(.)
## listing_id description host_id host_is_superhost
## Min. : 70820 Length:768 Min. : 52536 FALSE:399
## 1st Qu.:15302270 Class :character 1st Qu.: 20742039 TRUE :340
## Median :23322638 Mode :character Median : 58402902 NA's : 29
## Mean :22988684 Mean : 80545351
## 3rd Qu.:32288291 3rd Qu.:125651447
## Max. :40854693 Max. :314677302
##
## property_type review_scores_rating rating_group price_num
## Length:768 Min. : 40.0 Min. :1.000 Min. : 1.00
## Class :character 1st Qu.: 93.0 1st Qu.:5.000 1st Qu.: 38.00
## Mode :character Median : 97.0 Median :6.000 Median : 62.00
## Mean : 94.9 Mean :5.343 Mean : 82.09
## 3rd Qu.: 99.0 3rd Qu.:6.000 3rd Qu.: 99.25
## Max. :100.0 Max. :6.000 Max. :843.00
## NA's :6 NA's :6
## neighbourhood total
## ashley : 88 Min. : 1.000
## central : 80 1st Qu.: 1.000
## clifton : 59 Median : 1.000
## clifton down : 46 Mean : 7.953
## cotham : 45 3rd Qu.: 3.000
## windmill hill: 41 Max. :52.000
## (Other) :409
# list all the hosts with more than 10 listings (only two in this data)
listings.description %>% group_by(host_id) %>% count() %>% arrange(desc(n)) %>% filter(n>10) %>% select(host_id)
## # A tibble: 2 x 1
## # Groups: host_id [2]
## host_id
## <dbl>
## 1 65192629
## 2 125651447
# hosts with multiple listings (more than one)
host.greaterthan1listing <- listings.description %>% group_by(host_id) %>% count() %>% arrange(desc(n)) %>% filter(n>1)
# top-20 words used by multi-listing hosts
listings.description.working %>%
filter(host_id %in% host.greaterthan1listing$host_id) %>%
count(word) %>%
mutate(word=reorder(word,n)) %>%
slice_max(n,n=20) %>%
ggplot(aes(word,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Hosts with multiple listings") +
coord_flip()
# hosts with exactly 1 listing
host.with1listing <- listings.description %>% group_by(host_id) %>% count() %>% arrange(desc(n)) %>% filter(n==1)
# top-20 words used by single-listing hosts
listings.description.working %>% filter(host_id %in% host.with1listing$host_id) %>%
count(word) %>%
mutate(word=reorder(word,n)) %>%
slice_max(n,n=20) %>%
ggplot(aes(word,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Hosts with 1 listing") +
coord_flip()
# Side-by-side comparison of the two plots above: top-20 words for
# single-listing hosts vs. hosts with multiple listings.
plot.1listing.top20 <- listings.description.working %>%
  filter(host_id %in% host.with1listing$host_id) %>%
  count(word) %>%
  mutate(word = reorder(word, n)) %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(word, n)) +
  geom_bar(stat = "identity") +
  xlab(NULL) +
  labs(title = "Hosts with 1 listing") +
  coord_flip()
plot.multilisting.top20 <- listings.description.working %>%
  filter(host_id %in% host.greaterthan1listing$host_id) %>%
  count(word) %>%
  mutate(word = reorder(word, n)) %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(word, n)) +
  geom_bar(stat = "identity") +
  xlab(NULL) +
  labs(title = "Hosts with multiple listings") +
  coord_flip()
grid.arrange(plot.1listing.top20, plot.multilisting.top20, nrow = 1)
# Does description length relate to the rating score?
# log(nchar()) tames the right-skewed length distribution.
listings.description %>%
mutate(length.description = log(nchar(description))) -> ld
# distribution of (log) description length
ld %>%
ggplot(aes(x=length.description)) +
geom_histogram(bins = 50)
# identifying outliers using IQR (boxplot whisker rule)
outliers2 <- boxplot(ld$length.description, ylab = "Description Length")$out
# drop the rows containing outliers
ld <- ld[!ld$length.description %in% outliers2,]
# plot a boxplot without outliers
boxplot(ld$length.description, ylab = "Description Length")
# scatter plot of length vs. rating with a loess smoother
# (fixed typo in the subtitle: "Lenght" -> "Length")
ld %>% ggplot(aes(x=length.description,y=review_scores_rating))+geom_point()+geom_smooth() + labs(x = "Description length", y="Rating scores", subtitle = "Rating Scores and Description Length Relationship")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
Although we removed outliers and applied a log transformation to the description length, there is no clear relationship between the review scores and the length.
Let’s explore distinctive words in terms of frequency across rating groups compared with the listings with review scores are 96-100.
# word-frequency proportions per rating group, reshaped so each of groups 1-5
# can be compared against group 6 (the highest-score group) on the y-axis
listings.description.working %>%
count(rating_group,word) %>%
group_by(rating_group) %>%
mutate(proportion = n/sum(n)) %>%
select(-n) %>%
# wide: one column per rating group, then long again for groups 1-5 only,
# leaving group 6's column (`6`) as the common comparison axis
pivot_wider(names_from = rating_group, values_from = proportion) %>%
pivot_longer('1':'5',
names_to = "rating", values_to = "proportion")->listings.description.working.freq_plot
# points near the diagonal are words used with similar frequency in both groups
ggplot(listings.description.working.freq_plot, aes(x = proportion, y = `6`,
color = abs(`6` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001),
low = "darkslategray4", high = "gray75") +
facet_wrap(~rating, ncol = 2) +
theme(legend.position="none") +
labs(subtitle = "Word Frequency Proportion Against The Highest Scores Group (y-axis)", x = NULL, y = NULL)
# build bigrams from descriptions, dropping any bigram where either word is a
# standard or custom stop word, then re-unite the two words into one column
listings.description.working.bigrams <- listings.description %>%
unnest_tokens(bigram, description, token="ngrams",n=2) %>%
separate(bigram, c("word1","word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word1 %in% custom_stopwords$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word2 %in% custom_stopwords$word) %>%
unite(bigram, word1, word2, sep = " ")
# top 20 bigrams across all descriptions
listings.description.working.bigrams %>%
count(bigram) %>%
mutate(bigram=reorder(bigram,n)) %>%
slice_max(n,n=20) %>%
ggplot(aes(bigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
coord_flip()
Recall that the unigram analysis above provides the sense of space, bed, and equipped.
Here, with bigrams, it provides more understanding and details where hosts use the words ‘double bed’ and ‘double bedroom’ compared to ‘bed’ in the unigram analysis.
It is interesting that ‘guest access’ is the most frequent bigram hosts use, which is potentially important for attracting guests.
In addition, we see the words indicating location and amenities such as ‘minutes walk’; ‘walking distance’; ‘equipped kitchen’; ‘washing machine’. This potentially signifies the relationship of the number of amenities, location, room type, to the price or rating scores, which we will investigate more in the last section of part A.
Let’s explore common words by rating groups.
# most common bigrams used when rating group = 1
# (fix: the original used slice_head(n=10), but count() returns bigrams in
# alphabetical order, so it picked the first 10 alphabetically rather than the
# 10 most frequent; slice_max(n,n=10) matches groups 4-6 below)
rating4.maxbigram <- listings.description.working.bigrams %>%
filter(rating_group==1) %>%
count(bigram) %>%
mutate(bigram=reorder(bigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(bigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Review Score: 0-75") +
coord_flip()
# most common bigrams used when rating group = 2 (same slice_head -> slice_max fix)
rating6.maxbigram <- listings.description.working.bigrams %>%
filter(rating_group==2) %>%
count(bigram) %>%
mutate(bigram=reorder(bigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(bigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Review Score: 76-80")+
coord_flip()
# most common bigrams used when rating group = 3 (same slice_head -> slice_max fix)
rating7.maxbigram <- listings.description.working.bigrams %>%
filter(rating_group==3) %>%
count(bigram) %>%
mutate(bigram=reorder(bigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(bigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Review Score: 81-85") +
coord_flip()
# most common bigrams used when rating group = 4 (top 10 by count)
rating8.maxbigram <- listings.description.working.bigrams %>%
filter(rating_group==4) %>%
count(bigram) %>%
mutate(bigram=reorder(bigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(bigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Review Score: 86-90") +
coord_flip()
# most common bigrams used when rating group = 5
rating9.maxbigram <- listings.description.working.bigrams %>%
filter(rating_group==5) %>%
count(bigram) %>%
mutate(bigram=reorder(bigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(bigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Review Score: 91-95")+
coord_flip()
# most common bigrams used when rating group = 6
rating10.maxbigram <- listings.description.working.bigrams %>%
filter(rating_group==6) %>%
count(bigram) %>%
mutate(bigram=reorder(bigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(bigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(title = "Review Score: 96-100")+
coord_flip()
# combine the six per-group bigram plots and display in a 2-column grid
maxbigram.plots = arrangeGrob(rating4.maxbigram,rating6.maxbigram, rating7.maxbigram, rating8.maxbigram, rating9.maxbigram, rating10.maxbigram)
grid.arrange(maxbigram.plots, ncol = 2, widths = c(3/4,1/4))
# build trigrams, dropping any trigram where any of the three words is a
# standard or custom stop word, then re-unite into one column
listings.description.working.trigrams <- listings.description %>%
unnest_tokens(trigram, description, token="ngrams",n=3) %>%
separate(trigram, c("word1","word2","word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word1 %in% custom_stopwords$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word2 %in% custom_stopwords$word) %>%
filter(!word3 %in% stop_words$word) %>%
filter(!word3 %in% custom_stopwords$word) %>%
unite(trigram, word1, word2,word3, sep = " ")
# top 20 trigrams across all descriptions
listings.description.working.trigrams %>%
count(trigram) %>%
mutate(trigram=reorder(trigram,n)) %>%
slice_max(n,n=20) %>%
ggplot(aes(trigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
coord_flip()
Trigrams supply more detail, especially phrases related to amenities such as ‘42 free view’ and ‘free view channels’, as well as phrases related to bedrooms, e.g., ‘king size bed’ and ‘en suite shower’.
Let’s explore the most common trigrams rating wise.
# most common trigrams used when rating group = 1
# (fix: the original used slice_head(n=10), but count() returns trigrams in
# alphabetical order, so it picked the first 10 alphabetically rather than the
# 10 most frequent; slice_max(n,n=10) matches groups 4-6 below)
rating4.maxtrigram <- listings.description.working.trigrams %>%
filter(rating_group==1) %>%
count(trigram) %>%
mutate(trigram=reorder(trigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(trigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(subtitle = "Review Score: 0-75")+
coord_flip()
# most common trigrams used when rating group = 2 (same slice_head -> slice_max fix)
rating6.maxtrigram <- listings.description.working.trigrams %>%
filter(rating_group==2) %>%
count(trigram) %>%
mutate(trigram=reorder(trigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(trigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(subtitle = "Review Score: 76-80")+
coord_flip()
# most common trigrams used when rating group = 3 (same slice_head -> slice_max fix)
rating7.maxtrigram <- listings.description.working.trigrams %>%
filter(rating_group==3) %>%
count(trigram) %>%
mutate(trigram=reorder(trigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(trigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(subtitle = "Review Score: 81-85")+
coord_flip()
# most common trigrams used when rating group = 4 (top 10 by count)
rating8.maxtrigram <- listings.description.working.trigrams %>%
filter(rating_group==4) %>%
count(trigram) %>%
mutate(trigram=reorder(trigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(trigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(subtitle = "Review Score: 86-90")+
coord_flip()
# most common trigrams used when rating group = 5
rating9.maxtrigram <- listings.description.working.trigrams %>%
filter(rating_group==5) %>%
count(trigram) %>%
mutate(trigram=reorder(trigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(trigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(subtitle = "Review Score: 91-95")+
coord_flip()
# most common trigrams used when rating group = 6
rating10.maxtrigram <- listings.description.working.trigrams %>%
filter(rating_group==6) %>%
count(trigram) %>%
mutate(trigram=reorder(trigram,n)) %>%
slice_max(n,n=10) %>%
ggplot(aes(trigram,n)) +
geom_bar(stat = "identity") +
xlab(NULL) +
labs(subtitle = "Review Score: 96-100")+
coord_flip()
# combine the six per-group trigram plots and display in a 2-column grid
maxtrigram.plots = arrangeGrob(rating4.maxtrigram,rating6.maxtrigram, rating7.maxtrigram, rating8.maxtrigram, rating9.maxtrigram, rating10.maxtrigram)
grid.arrange(maxtrigram.plots, ncol = 2, widths = c(3/4,1/4))
From the plot, descriptions used by the first group with rating scores 0-75 are missing amenity-related words. This is possibly one reason why this group has such low rating scores.
Let’s further analyze the result by using a network visualization of bigrams and trigrams.
# bigram network: nodes are words, directed edges word1 -> word2 for bigrams
# occurring more than 20 times
bigram_graph <- listings.description.working.bigrams %>% separate(bigram, c("word1","word2"), sep=" ") %>%
select(word1,word2) %>% count(word1,word2,sort = TRUE) %>% filter(n>20) %>% graph_from_data_frame()
# fix the force-directed ("fr") layout for reproducibility
set.seed(2107)
ggraph(bigram_graph, layout="fr")+
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = grid::arrow(type = "closed", length = unit(.10, "inches")), end_cap= circle(0.07, 'inches')) +
geom_node_point(colour = "lightblue", size=5)+
geom_node_text(aes(label=name), vjust = 1, hjust = 1) +
theme_void()
# trigram network: same idea with a lower frequency threshold (n > 10)
trigram_graph <- listings.description.working.trigrams %>% separate(trigram, c("word1","word2","word3"), sep=" ") %>%
select(word1,word2,word3) %>% count(word1,word2,word3,sort = TRUE) %>% filter(n>10) %>% graph_from_data_frame()
ggraph(trigram_graph, layout="fr")+
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = grid::arrow(type = "closed", length = unit(.10, "inches")), end_cap= circle(0.07, 'inches')) +
geom_node_point(colour = "lightblue", size=5)+
geom_node_text(aes(label=name), vjust = 1, hjust = 1) +
theme_void()
# Clean up intermediate objects that are no longer needed (one rm() call
# instead of 22 separate remove() calls; same objects, same warnings for any
# name that does not exist).
rm(hostname, hostname_vector, langmodel, lematized,
   listing_tf_idf, listing_tf_idf_2, listing_words,
   maxbigram.plots, maxtrigram.plots, maxwords.plots,
   listings.description, listings.description.byrating,
   listings.description.working, listings.description.working.bigrams,
   listings.description.working.trigrams,
   listings.description.working.freq_plot,
   postagged, rating10.maxbigram, rating10.maxtrigram, rating10.maxwords,
   token_df_all, token_df2)
Is mentioning the name of the owner important?
# read the backup data
df <- readRDS("df2_rev_formality_readability.rds")
# host first names per listing (file[[1]] is the detailed listings file)
df_hostname <- file[[1]] %>% select (listing_id = id,host_name) %>% na.omit()
df_hostname$host_name %>% tolower() -> df_hostname$host_name
# one row per name token (handles listings whose host field holds several names)
df_hostname <- df_hostname %>% unnest_tokens(host_name,host_name)
# keep only names longer than 2 characters
# (fix: length() returns the length of the whole vector — a scalar — so the
# original filter(length(host_name) > 2) kept or dropped ALL rows at once;
# nchar() gives the per-element string length that was intended)
df_hostname <- df_hostname %>% filter(nchar(host_name) > 2)
# remove stop words
data("stop_words")
# strip non-letter characters
# (fix: inside a TRE character class "\\s" matches a literal backslash or "s",
# not whitespace; tokens are single lowercase words, so only letters are kept)
df_hostname$host_name <- gsub("[^a-zA-Z]", "",df_hostname$host_name)
df_hostname <- df_hostname %>% anti_join(stop_words, by = c("host_name"="word"))
# join host names onto the review data (may duplicate rows for multi-name hosts)
df %>% left_join(df_hostname,by = "listing_id") -> df
We will check the influence of mentioning the name of the owner to the review rating score
Hence, we will do feature engineering by creating a new binary column [0,1] whether a reviewer has mentioned the owner name or not
# Flag whether each review's comment mentions the host's first name (1/0).
# Vectorized with mapply instead of the original row-wise for loop; each
# element still gets the identical grepl(pattern, x, ignore.case = TRUE) call,
# and this also behaves correctly when df has zero rows (the original
# 1:nrow(df) would have iterated over c(1, 0)).
df$host_name_mentioned <- as.numeric(
  mapply(grepl, df$host_name, df$comments,
         MoreArgs = list(ignore.case = TRUE),
         USE.NAMES = FALSE)
)
After left joining df_hostname, the number of observations increases because listings with multiple host names are split across multiple rows.
Now, we will detect the unique_id, which has more than one row
# unique_ids that appear on more than one row (multi-name hosts)
df %>% select(unique_id,host_name_mentioned) %>% group_by(unique_id) %>%summarise( n=n()) %>% filter(n>1) %>% select(unique_id) -> multipleID
# let's filter these reviews and create another data frame,
df %>% select(listing_id,unique_id,host_name_mentioned) %>% filter(unique_id %in% c(multipleID$unique_id)) -> duplicated_df
# total number of name mentions for each review across its duplicate rows
duplicated_df %>% group_by(unique_id) %>% mutate(n = sum(host_name_mentioned)) -> duplicated_df
# dummy variable: 1 if ANY of the host's names was mentioned, else 0
duplicated_df$host_name_mentioned <- ifelse(duplicated_df$n>0,1,0)
# write the combined flag back onto the duplicated rows of df
# NOTE(review): this assignment relies on the rows of df selected by which()
# being in the same order as duplicated_df — appears to hold because both come
# from the same filter over df, but worth confirming
df[which(df$unique_id %in% c(duplicated_df$unique_id)),]$host_name_mentioned <- duplicated_df$host_name_mentioned
# keep one row per unique_id, restoring the pre-join row count
df %>% distinct(unique_id,.keep_all =TRUE) -> df
df %>% select(listing_id,unique_id,host_name_mentioned)
## # A tibble: 38,869 x 3
## listing_id unique_id host_name_mentioned
## <dbl> <dbl> <dbl>
## 1 70820 8163487 1
## 2 70820 8397676 1
## 3 70820 8706695 0
## 4 70820 13024371 1
## 5 70820 14247963 1
## 6 70820 14845082 1
## 7 70820 15989781 1
## 8 70820 16179013 1
## 9 70820 17423387 1
## 10 70820 20072496 1
## # ... with 38,859 more rows
# box plot of rating scores by whether the host name was mentioned
ggplot(subset(df,!is.na(host_name_mentioned)),aes(x=factor(host_name_mentioned),y=review_scores_rating))+geom_boxplot() + labs(x="Host name mentioned", y="Rating scores", subtitle="Rating Scores by Host Name Groups")
# Welch two-sample t-test: do mean rating scores differ between the two groups?
t.test(df$review_scores_rating~factor(df$host_name_mentioned))
##
## Welch Two Sample t-test
##
## data: df$review_scores_rating by factor(df$host_name_mentioned)
## t = -39.162, df = 37112, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.375822 -1.244667
## sample estimates:
## mean in group 0 mean in group 1
## 95.35450 96.66475
Using the textual description of the property supplied by the owner, how does this relate with the price that the property is listed for rent?
To analyze the textual description, we use formality and readability extracted from the description. First we read ‘df’ data and only keep one row for each listing since a listing has only one description
# read data
df2 = readRDS("df2.rds")
# one row per listing (a listing has a single description)
df_listing <- df2 %>% distinct(listing_id,.keep_all = TRUE)
# formality score of each description (qdap; grouped by listing_id)
formality_des <- qdap::formality(df_listing$description_ori,df_listing$listing_id)
# extract the per-listing formality table from the qdap result object
formality_des$formality %>% select(listing_id,formality) -> formality_calc_des
# qdap returns the grouping variable as character; convert back for the join
formality_calc_des$listing_id <- as.numeric(formality_calc_des$listing_id)
# attach the formality score to the listing data
df_listing %>% left_join(formality_calc_des, by = "listing_id") -> df_listing
# calculate the Flesch-Kincaid readability of every description.
# Results are collected in a preallocated list and bound once at the end
# (the original grew a data.frame with bind_rows inside the loop, which is
# O(n^2)); seq_len() replaces the unsafe 1:nrow() pattern.
readability_desc_all <- data.frame()
library(qdap)
library(tm)
readability_list <- vector("list", nrow(df_listing))
for(i in seq_len(nrow(df_listing))){
readability_h <- data.frame()
# normalize encoding, then strip digits/punctuation before scoring
this_text <- iconv(df_listing$description[i])
this_text <- removeNumbers(this_text)
this_text <- removePunctuation(this_text)
# flesch_kincaid() can fail on degenerate text; keep going on error
tryCatch(readability_h <- flesch_kincaid(this_text),error=function(e){
cat("Error parsing")
})
if(!is.null(readability_h$Readability)){
readability_h <- readability_h$Readability
readability_h$listing_id <- df_listing$listing_id[i]
readability_list[[i]] <- readability_h
}
print(i)
}
# bind_rows() skips NULL entries, so failed/empty iterations drop out exactly
# as they did before
readability_desc_all <- bind_rows(readability_list)
# attach the readability columns to the listing data
df_listing <- df_listing %>%
left_join(readability_desc_all, by = "listing_id")
# cache the expensive result so the loop above need not be re-run
saveRDS(df_listing, file = "df_desc_formality_read.rds")
# df_listing <- readRDS("df_desc_formality_read.rds")
# reload from the cache
df_listing = readRDS("df_desc_formality_read.rds")
# non-numerical predictors (factors/logicals) kept for the regression later
df_etc_des = df_listing %>% select_if(negate(is.numeric)) %>% select(host_is_superhost:host_identity_verified, room_type,instant_bookable,response_time,neighbourhood)
# numerical predictors, including the formality/readability features
df_numeric_des = df_listing %>% select(number_of_reviews, reviews_per_month:review_scores_value,amen_items,ops_year,acceptance_rate,host_total_listings_count, formality, word.count, syllable.count, FK_grd.lvl, FK_read.ease, price_adj)
# correlation matrix of the numeric predictors (corrr::correlate)
correlate(df_numeric_des) -> cordata_3
# variables with |correlation| > 0.1 against adjusted price
cordata_3 %>% select(term, price_adj) %>% filter(price_adj > 0.1 | price_adj < -0.1) %>% arrange(desc(price_adj))
## # A tibble: 4 x 2
## term price_adj
## <chr> <dbl>
## 1 host_total_listings_count 0.125
## 2 number_of_reviews -0.150
## 3 review_scores_value -0.163
## 4 reviews_per_month -0.168
# min-max scale all numeric predictors to [0, 1] (caret)
preproc3 <- preProcess(df_numeric_des, method=c("range"))
norm3 <- predict(preproc3, df_numeric_des)
# regression data set: normalized numerics plus the categorical columns
df_reg_des = cbind(norm3, df_etc_des)
# price vs. formality with a loess smoother
df_reg_des %>% select(formality,price_adj) %>% na.omit() %>% ggplot(aes(x=formality,y= price_adj))+geom_smooth(method="loess",se = F)+geom_point() + labs(x="Formality", y="Price", subtitle="Price and Formality Relationship")
## `geom_smooth()` using formula 'y ~ x'
From the result above, it seems there is no clear relationship, which is possibly due to outliers.
Let’s remove outliers using IQR technique.
# explore the distributions before outlier removal
hist(df_reg_des$formality)
hist(df_reg_des$price_adj)
# identifying price outliers using IQR (boxplot whisker rule)
outliers6 <- boxplot(df_reg_des$price_adj, ylab = "Price")$out
# drop the rows containing price outliers
df_reg_des <- df_reg_des[!df_reg_des$price_adj %in% outliers6,]
# re-check the distributions after removal
hist(df_reg_des$formality)
hist(df_reg_des$price_adj)
# price vs. formality again, without the price outliers
df_reg_des %>% select(formality,price_adj) %>% na.omit() %>% ggplot(aes(x=formality,y= price_adj))+geom_smooth(method="loess",se = F)+geom_point() + labs(x="Formality", y="Price", subtitle="Price and Formality Relationship")
## `geom_smooth()` using formula 'y ~ x'
# IQR outliers of the (normalized) FK grade level
outliers7 <- boxplot(df_reg_des$FK_grd.lvl, ylab = "FKgrd level")$out
# drop the rows containing outliers
df_reg_des <- df_reg_des[!df_reg_des$FK_grd.lvl %in% outliers7,]
# summarise the FK_grd.lvl distribution after removal (min, quartiles, max)
df_reg_des$FK_grd.lvl %>% summary(.)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.6546 0.8395 0.8740 0.8652 0.9038 1.0000 1
# price vs. FK grade level with a loess smoother
df_reg_des %>% select(FK_grd.lvl, price_adj) %>% na.omit() %>% ggplot(aes(x=FK_grd.lvl,y= price_adj))+geom_smooth(method="loess",se = F)+geom_point() + labs(x="Reading grade level", y="Price", subtitle="Price and Reading Grade Level Relationship")
## `geom_smooth()` using formula 'y ~ x'
# IQR outliers of the FK reading-ease score
outliers8 <- boxplot(df_reg_des$FK_read.ease, ylab = "FKrd ease level")$out
# drop the rows containing outliers
df_reg_des <- df_reg_des[!df_reg_des$FK_read.ease %in% outliers8,]
# re-summarise FK_grd.lvl (reading-ease removal also affects these rows)
df_reg_des$FK_grd.lvl %>% summary(.)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.7413 0.8480 0.8787 0.8773 0.9058 1.0000 1
# price vs. FK reading ease with a loess smoother
df_reg_des %>% select(FK_read.ease,price_adj) %>% na.omit() %>% ggplot(aes(x=FK_read.ease,y= price_adj))+geom_smooth(method="loess",se = F)+geom_point() + labs(x="Reading ease level", y="Price", subtitle="Price and Reading Ease Level Relationship")
## `geom_smooth()` using formula 'y ~ x'
# room_type as factor; drop 'host_has_profile_pic' (collinear with the rest)
df_reg_des2 <- df_reg_des %>% mutate(room_type = factor(room_type)) %>% select(-host_has_profile_pic)
# remove rows with NA values (lm would drop them anyway; this makes it explicit)
df_reg_des3 <- df_reg_des2 %>% na.omit()
# OLS regression of adjusted price on all remaining predictors
model4 <- lm(price_adj ~ ., data = df_reg_des3, na.action = na.exclude)
# tidy coefficient table rendered with kable
model4 %>% tidy() %>% kable(
caption = "Coefficient Estimation for Price Prediction",
col.names = c("Predictor", "B", "SE", "t", "p"),
digits = c(0, 2, 2, 2, 2)
)
| Predictor | B | SE | t | p |
|---|---|---|---|---|
| (Intercept) | -0.20 | 0.33 | -0.62 | 0.54 |
| number_of_reviews | -0.09 | 0.13 | -0.71 | 0.48 |
| reviews_per_month | -0.05 | 0.15 | -0.34 | 0.73 |
| review_scores_rating | 0.18 | 0.20 | 0.89 | 0.38 |
| review_scores_accuracy | 0.06 | 0.37 | 0.15 | 0.88 |
| review_scores_cleanliness | 0.05 | 0.14 | 0.35 | 0.72 |
| review_scores_checkin | 0.11 | 0.30 | 0.38 | 0.71 |
| review_scores_communication | -0.01 | 0.32 | -0.02 | 0.99 |
| review_scores_location | 0.23 | 0.12 | 1.97 | 0.05 |
| review_scores_value | -0.13 | 0.14 | -0.93 | 0.35 |
| amen_items | -0.04 | 0.08 | -0.47 | 0.64 |
| ops_year | 0.11 | 0.06 | 1.93 | 0.06 |
| acceptance_rate | 0.07 | 0.04 | 1.82 | 0.07 |
| host_total_listings_count | 1.66 | 1.12 | 1.48 | 0.14 |
| formality | -0.03 | 0.07 | -0.40 | 0.69 |
| word.count | 5.81 | 7.43 | 0.78 | 0.44 |
| syllable.count | 1.80 | 2.32 | 0.78 | 0.44 |
| FK_grd.lvl | -7.50 | 9.57 | -0.78 | 0.44 |
| FK_read.ease | NA | NA | NA | NA |
| host_is_superhostTRUE | 0.00 | 0.02 | 0.17 | 0.87 |
| host_identity_verifiedTRUE | 0.01 | 0.03 | 0.18 | 0.86 |
| room_typePrivate room | -0.06 | 0.02 | -2.62 | 0.01 |
| instant_bookableTRUE | 0.00 | 0.03 | -0.09 | 0.93 |
| response_timewithin a day | 0.02 | 0.02 | 0.97 | 0.34 |
| response_timea few days or more | 0.09 | 0.04 | 2.13 | 0.04 |
| neighbourhoodavonmouth & lawrence weston | -0.15 | 0.05 | -2.81 | 0.01 |
| neighbourhoodbedminster | 0.00 | 0.06 | -0.01 | 0.99 |
| neighbourhoodbishopston & ashley down | 0.00 | 0.05 | 0.09 | 0.93 |
| neighbourhoodbrislington east | -0.26 | 0.11 | -2.51 | 0.01 |
| neighbourhoodbrislington west | 0.05 | 0.10 | 0.51 | 0.61 |
| neighbourhoodcentral | 0.02 | 0.05 | 0.32 | 0.75 |
| neighbourhoodclifton | 0.13 | 0.05 | 2.71 | 0.01 |
| neighbourhoodclifton down | 0.12 | 0.05 | 2.13 | 0.04 |
| neighbourhoodcotham | 0.12 | 0.04 | 2.98 | 0.00 |
| neighbourhoodeaston | -0.07 | 0.05 | -1.31 | 0.19 |
| neighbourhoodeastville | 0.02 | 0.03 | 0.57 | 0.57 |
| neighbourhoodfilwood | -0.08 | 0.07 | -1.15 | 0.25 |
| neighbourhoodfrome vale | -0.11 | 0.10 | -1.11 | 0.27 |
| neighbourhoodhenbury & brentry | -0.09 | 0.09 | -0.92 | 0.36 |
| neighbourhoodhorfield | -0.06 | 0.11 | -0.49 | 0.62 |
| neighbourhoodhotwells & harbourside | 0.05 | 0.04 | 1.23 | 0.22 |
| neighbourhoodknowle | -0.03 | 0.07 | -0.41 | 0.68 |
| neighbourhoodlawrence hill | 0.01 | 0.05 | 0.15 | 0.88 |
| neighbourhoodredland | 0.05 | 0.05 | 1.21 | 0.23 |
| neighbourhoodsouthmead | -0.01 | 0.10 | -0.08 | 0.94 |
| neighbourhoodsouthville | 0.08 | 0.06 | 1.41 | 0.16 |
| neighbourhoodst george troopers hill | 0.04 | 0.07 | 0.60 | 0.55 |
| neighbourhoodst george west | -0.09 | 0.10 | -0.93 | 0.36 |
| neighbourhoodstoke bishop | 0.08 | 0.09 | 0.85 | 0.40 |
| neighbourhoodwestbury-on-trym & henleaze | 0.04 | 0.06 | 0.70 | 0.49 |
| neighbourhoodwindmill hill | -0.09 | 0.03 | -2.73 | 0.01 |
# Stepwise predictor selection (add and drop, direction = "both"),
# minimising AIC starting from the full model.
aic_model4 <- MASS::stepAIC(model4, direction = "both")
## Start: AIC=-599.59
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_accuracy + review_scores_cleanliness + review_scores_checkin +
## review_scores_communication + review_scores_location + review_scores_value +
## amen_items + ops_year + acceptance_rate + host_total_listings_count +
## formality + word.count + syllable.count + FK_grd.lvl + FK_read.ease +
## host_is_superhost + host_identity_verified + room_type +
## instant_bookable + response_time + neighbourhood
##
##
## Step: AIC=-599.59
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_accuracy + review_scores_cleanliness + review_scores_checkin +
## review_scores_communication + review_scores_location + review_scores_value +
## amen_items + ops_year + acceptance_rate + host_total_listings_count +
## formality + word.count + syllable.count + FK_grd.lvl + host_is_superhost +
## host_identity_verified + room_type + instant_bookable + response_time +
## neighbourhood
##
## Df Sum of Sq RSS AIC
## - review_scores_communication 1 0.00000 0.56927 -601.59
## - instant_bookable 1 0.00006 0.56933 -601.58
## - review_scores_accuracy 1 0.00016 0.56943 -601.56
## - host_is_superhost 1 0.00021 0.56947 -601.55
## - host_identity_verified 1 0.00024 0.56951 -601.54
## - reviews_per_month 1 0.00084 0.57011 -601.41
## - review_scores_cleanliness 1 0.00091 0.57017 -601.39
## - review_scores_checkin 1 0.00102 0.57029 -601.36
## - formality 1 0.00118 0.57044 -601.33
## - amen_items 1 0.00162 0.57088 -601.23
## - number_of_reviews 1 0.00363 0.57290 -600.77
## - syllable.count 1 0.00437 0.57364 -600.61
## - word.count 1 0.00440 0.57367 -600.60
## - FK_grd.lvl 1 0.00442 0.57369 -600.60
## - review_scores_rating 1 0.00570 0.57496 -600.31
## - review_scores_value 1 0.00624 0.57551 -600.19
## <none> 0.56927 -599.59
## - host_total_listings_count 1 0.01585 0.58512 -598.05
## - acceptance_rate 1 0.02387 0.59314 -596.30
## - response_time 2 0.03507 0.60434 -595.88
## - ops_year 1 0.02688 0.59614 -595.64
## - review_scores_location 1 0.02788 0.59715 -595.43
## - room_type 1 0.04946 0.61873 -590.85
## - neighbourhood 26 0.50744 1.07671 -569.38
##
## Step: AIC=-601.59
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_accuracy + review_scores_cleanliness + review_scores_checkin +
## review_scores_location + review_scores_value + amen_items +
## ops_year + acceptance_rate + host_total_listings_count +
## formality + word.count + syllable.count + FK_grd.lvl + host_is_superhost +
## host_identity_verified + room_type + instant_bookable + response_time +
## neighbourhood
##
## Df Sum of Sq RSS AIC
## - instant_bookable 1 0.00006 0.56933 -603.58
## - review_scores_accuracy 1 0.00021 0.56948 -603.55
## - host_is_superhost 1 0.00022 0.56949 -603.55
## - host_identity_verified 1 0.00026 0.56953 -603.54
## - reviews_per_month 1 0.00085 0.57012 -603.40
## - review_scores_cleanliness 1 0.00092 0.57020 -603.39
## - review_scores_checkin 1 0.00108 0.57036 -603.35
## - formality 1 0.00134 0.57062 -603.29
## - amen_items 1 0.00168 0.57095 -603.21
## - number_of_reviews 1 0.00364 0.57291 -602.77
## - syllable.count 1 0.00438 0.57365 -602.61
## - word.count 1 0.00442 0.57369 -602.60
## - FK_grd.lvl 1 0.00443 0.57370 -602.59
## - review_scores_rating 1 0.00633 0.57560 -602.17
## - review_scores_value 1 0.00688 0.57615 -602.04
## <none> 0.56927 -601.59
## - host_total_listings_count 1 0.01609 0.58536 -600.00
## + review_scores_communication 1 0.00000 0.56927 -599.59
## - acceptance_rate 1 0.02555 0.59483 -597.93
## - response_time 2 0.03603 0.60530 -597.68
## - ops_year 1 0.02746 0.59673 -597.52
## - review_scores_location 1 0.02878 0.59805 -597.23
## - room_type 1 0.04967 0.61894 -592.80
## - neighbourhood 26 0.51777 1.08704 -570.15
##
## Step: AIC=-603.58
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_accuracy + review_scores_cleanliness + review_scores_checkin +
## review_scores_location + review_scores_value + amen_items +
## ops_year + acceptance_rate + host_total_listings_count +
## formality + word.count + syllable.count + FK_grd.lvl + host_is_superhost +
## host_identity_verified + room_type + response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - review_scores_accuracy 1 0.00021 0.56954 -605.53
## - host_is_superhost 1 0.00028 0.56961 -605.52
## - host_identity_verified 1 0.00041 0.56974 -605.49
## - review_scores_cleanliness 1 0.00092 0.57025 -605.37
## - reviews_per_month 1 0.00101 0.57034 -605.35
## - review_scores_checkin 1 0.00107 0.57040 -605.34
## - formality 1 0.00136 0.57069 -605.27
## - amen_items 1 0.00169 0.57102 -605.20
## - number_of_reviews 1 0.00365 0.57298 -604.76
## - syllable.count 1 0.00496 0.57429 -604.46
## - word.count 1 0.00498 0.57431 -604.46
## - FK_grd.lvl 1 0.00499 0.57431 -604.46
## - review_scores_rating 1 0.00635 0.57568 -604.15
## - review_scores_value 1 0.00705 0.57638 -603.99
## <none> 0.56933 -603.58
## - host_total_listings_count 1 0.01644 0.58577 -601.91
## + instant_bookable 1 0.00006 0.56927 -601.59
## + review_scores_communication 1 0.00000 0.56933 -601.58
## - ops_year 1 0.02765 0.59698 -599.46
## - acceptance_rate 1 0.02790 0.59723 -599.41
## - response_time 2 0.03726 0.60659 -599.40
## - review_scores_location 1 0.02935 0.59868 -599.10
## - room_type 1 0.05234 0.62167 -594.24
## - neighbourhood 26 0.51934 1.08867 -571.96
##
## Step: AIC=-605.53
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_cleanliness + review_scores_checkin + review_scores_location +
## review_scores_value + amen_items + ops_year + acceptance_rate +
## host_total_listings_count + formality + word.count + syllable.count +
## FK_grd.lvl + host_is_superhost + host_identity_verified +
## room_type + response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - host_is_superhost 1 0.00035 0.56989 -607.45
## - host_identity_verified 1 0.00048 0.57002 -607.42
## - reviews_per_month 1 0.00101 0.57055 -607.31
## - review_scores_cleanliness 1 0.00111 0.57065 -607.28
## - formality 1 0.00138 0.57092 -607.22
## - amen_items 1 0.00169 0.57123 -607.15
## - review_scores_checkin 1 0.00171 0.57125 -607.15
## - number_of_reviews 1 0.00366 0.57320 -606.71
## - syllable.count 1 0.00476 0.57430 -606.46
## - word.count 1 0.00478 0.57432 -606.45
## - FK_grd.lvl 1 0.00479 0.57433 -606.45
## - review_scores_value 1 0.00691 0.57646 -605.98
## - review_scores_rating 1 0.00854 0.57808 -605.61
## <none> 0.56954 -605.53
## - host_total_listings_count 1 0.01628 0.58582 -603.90
## + review_scores_accuracy 1 0.00021 0.56933 -603.58
## + instant_bookable 1 0.00006 0.56948 -603.55
## + review_scores_communication 1 0.00005 0.56949 -603.55
## - ops_year 1 0.02785 0.59739 -601.37
## - acceptance_rate 1 0.02798 0.59752 -601.35
## - response_time 2 0.03798 0.60752 -601.20
## - review_scores_location 1 0.02978 0.59932 -600.96
## - room_type 1 0.05235 0.62189 -596.19
## - neighbourhood 26 0.51914 1.08868 -573.96
##
## Step: AIC=-607.45
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_cleanliness + review_scores_checkin + review_scores_location +
## review_scores_value + amen_items + ops_year + acceptance_rate +
## host_total_listings_count + formality + word.count + syllable.count +
## FK_grd.lvl + host_identity_verified + room_type + response_time +
## neighbourhood
##
## Df Sum of Sq RSS AIC
## - host_identity_verified 1 0.00063 0.57052 -609.31
## - reviews_per_month 1 0.00099 0.57088 -609.23
## - review_scores_cleanliness 1 0.00116 0.57105 -609.19
## - formality 1 0.00122 0.57111 -609.18
## - review_scores_checkin 1 0.00170 0.57159 -609.07
## - amen_items 1 0.00185 0.57174 -609.04
## - number_of_reviews 1 0.00362 0.57351 -608.64
## - syllable.count 1 0.00503 0.57492 -608.32
## - FK_grd.lvl 1 0.00506 0.57495 -608.31
## - word.count 1 0.00508 0.57497 -608.31
## - review_scores_value 1 0.00715 0.57704 -607.85
## <none> 0.56989 -607.45
## - review_scores_rating 1 0.00898 0.57887 -607.44
## - host_total_listings_count 1 0.01608 0.58597 -605.87
## + host_is_superhost 1 0.00035 0.56954 -605.53
## + review_scores_accuracy 1 0.00028 0.56961 -605.52
## + instant_bookable 1 0.00014 0.56976 -605.48
## + review_scores_communication 1 0.00004 0.56985 -605.46
## - ops_year 1 0.02750 0.59739 -603.37
## - acceptance_rate 1 0.02817 0.59806 -603.23
## - response_time 2 0.04096 0.61085 -602.50
## - review_scores_location 1 0.03880 0.60869 -600.96
## - room_type 1 0.05385 0.62374 -597.81
## - neighbourhood 26 0.52120 1.09109 -575.67
##
## Step: AIC=-609.31
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_cleanliness + review_scores_checkin + review_scores_location +
## review_scores_value + amen_items + ops_year + acceptance_rate +
## host_total_listings_count + formality + word.count + syllable.count +
## FK_grd.lvl + room_type + response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - reviews_per_month 1 0.00095 0.57148 -611.10
## - formality 1 0.00107 0.57159 -611.07
## - review_scores_cleanliness 1 0.00124 0.57176 -611.03
## - amen_items 1 0.00171 0.57223 -610.92
## - review_scores_checkin 1 0.00203 0.57255 -610.85
## - number_of_reviews 1 0.00379 0.57431 -610.46
## - syllable.count 1 0.00469 0.57521 -610.26
## - FK_grd.lvl 1 0.00475 0.57527 -610.24
## - word.count 1 0.00477 0.57529 -610.24
## - review_scores_value 1 0.00669 0.57722 -609.81
## - review_scores_rating 1 0.00874 0.57926 -609.35
## <none> 0.57052 -609.31
## + host_identity_verified 1 0.00063 0.56989 -607.45
## - host_total_listings_count 1 0.01737 0.58789 -607.44
## + host_is_superhost 1 0.00050 0.57002 -607.42
## + instant_bookable 1 0.00042 0.57010 -607.41
## + review_scores_accuracy 1 0.00039 0.57013 -607.40
## + review_scores_communication 1 0.00002 0.57050 -607.32
## - ops_year 1 0.02770 0.59822 -605.20
## - acceptance_rate 1 0.02842 0.59894 -605.04
## - response_time 2 0.04047 0.61099 -604.47
## - review_scores_location 1 0.03818 0.60870 -602.96
## - room_type 1 0.06032 0.63084 -598.35
## - neighbourhood 26 0.52597 1.09649 -577.03
##
## Step: AIC=-611.1
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_cleanliness +
## review_scores_checkin + review_scores_location + review_scores_value +
## amen_items + ops_year + acceptance_rate + host_total_listings_count +
## formality + word.count + syllable.count + FK_grd.lvl + room_type +
## response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - review_scores_cleanliness 1 0.00094 0.57241 -612.88
## - formality 1 0.00116 0.57264 -612.83
## - amen_items 1 0.00166 0.57313 -612.72
## - review_scores_checkin 1 0.00210 0.57358 -612.62
## - syllable.count 1 0.00517 0.57665 -611.93
## - FK_grd.lvl 1 0.00525 0.57672 -611.92
## - word.count 1 0.00527 0.57674 -611.91
## - review_scores_value 1 0.00682 0.57830 -611.56
## <none> 0.57148 -611.10
## - review_scores_rating 1 0.00953 0.58101 -610.96
## + reviews_per_month 1 0.00095 0.57052 -609.31
## + instant_bookable 1 0.00069 0.57079 -609.25
## + host_identity_verified 1 0.00059 0.57088 -609.23
## + host_is_superhost 1 0.00048 0.57100 -609.20
## + review_scores_accuracy 1 0.00038 0.57110 -609.18
## + review_scores_communication 1 0.00000 0.57147 -609.10
## - host_total_listings_count 1 0.01819 0.58966 -609.05
## - acceptance_rate 1 0.02761 0.59909 -607.01
## - number_of_reviews 1 0.02958 0.60106 -606.59
## - ops_year 1 0.03059 0.60207 -606.37
## - response_time 2 0.04079 0.61226 -606.20
## - review_scores_location 1 0.03735 0.60882 -604.93
## - room_type 1 0.05952 0.63100 -600.31
## - neighbourhood 26 0.53592 1.10739 -577.76
##
## Step: AIC=-612.88
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_checkin +
## review_scores_location + review_scores_value + amen_items +
## ops_year + acceptance_rate + host_total_listings_count +
## formality + word.count + syllable.count + FK_grd.lvl + room_type +
## response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - formality 1 0.00137 0.57378 -614.58
## - amen_items 1 0.00206 0.57447 -614.42
## - review_scores_checkin 1 0.00227 0.57469 -614.37
## - review_scores_value 1 0.00619 0.57860 -613.50
## - word.count 1 0.00755 0.57996 -613.19
## - syllable.count 1 0.00756 0.57997 -613.19
## - FK_grd.lvl 1 0.00758 0.57999 -613.19
## <none> 0.57241 -612.88
## + review_scores_cleanliness 1 0.00094 0.57148 -611.10
## + instant_bookable 1 0.00067 0.57174 -611.04
## + host_identity_verified 1 0.00067 0.57174 -611.03
## + reviews_per_month 1 0.00065 0.57176 -611.03
## + review_scores_accuracy 1 0.00060 0.57181 -611.02
## + host_is_superhost 1 0.00054 0.57187 -611.01
## + review_scores_communication 1 0.00001 0.57240 -610.89
## - host_total_listings_count 1 0.01945 0.59186 -610.58
## - number_of_reviews 1 0.02865 0.60106 -608.59
## - acceptance_rate 1 0.02867 0.60108 -608.58
## - response_time 2 0.04041 0.61283 -608.08
## - ops_year 1 0.03224 0.60465 -607.82
## - review_scores_rating 1 0.03340 0.60581 -607.57
## - review_scores_location 1 0.03653 0.60894 -606.90
## - room_type 1 0.05916 0.63157 -602.20
## - neighbourhood 26 0.53629 1.10870 -579.60
##
## Step: AIC=-614.58
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_checkin +
## review_scores_location + review_scores_value + amen_items +
## ops_year + acceptance_rate + host_total_listings_count +
## word.count + syllable.count + FK_grd.lvl + room_type + response_time +
## neighbourhood
##
## Df Sum of Sq RSS AIC
## - review_scores_checkin 1 0.00230 0.57608 -616.06
## - amen_items 1 0.00240 0.57619 -616.04
## - review_scores_value 1 0.00594 0.57973 -615.25
## - FK_grd.lvl 1 0.00896 0.58275 -614.58
## <none> 0.57378 -614.58
## - syllable.count 1 0.00897 0.58276 -614.57
## - word.count 1 0.00908 0.58287 -614.55
## + formality 1 0.00137 0.57241 -612.88
## + review_scores_cleanliness 1 0.00115 0.57264 -612.83
## + reviews_per_month 1 0.00071 0.57308 -612.73
## + instant_bookable 1 0.00062 0.57316 -612.72
## + review_scores_accuracy 1 0.00060 0.57318 -612.71
## + host_identity_verified 1 0.00049 0.57329 -612.69
## + host_is_superhost 1 0.00032 0.57347 -612.65
## + review_scores_communication 1 0.00004 0.57374 -612.59
## - host_total_listings_count 1 0.01910 0.59289 -612.35
## - acceptance_rate 1 0.02911 0.60289 -610.19
## - number_of_reviews 1 0.03004 0.60382 -609.99
## - response_time 2 0.04029 0.61408 -609.82
## - ops_year 1 0.03108 0.60487 -609.77
## - review_scores_rating 1 0.03241 0.60619 -609.49
## - review_scores_location 1 0.03958 0.61336 -607.97
## - room_type 1 0.05900 0.63278 -603.95
## - neighbourhood 26 0.53663 1.11041 -581.41
##
## Step: AIC=-616.06
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_location +
## review_scores_value + amen_items + ops_year + acceptance_rate +
## host_total_listings_count + word.count + syllable.count +
## FK_grd.lvl + room_type + response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - review_scores_value 1 0.00442 0.58050 -617.07
## - amen_items 1 0.00495 0.58103 -616.96
## <none> 0.57608 -616.06
## - syllable.count 1 0.00940 0.58548 -615.97
## - FK_grd.lvl 1 0.00941 0.58549 -615.97
## - word.count 1 0.00958 0.58566 -615.93
## + review_scores_checkin 1 0.00230 0.57378 -614.58
## + review_scores_accuracy 1 0.00173 0.57436 -614.45
## + formality 1 0.00140 0.57469 -614.37
## + review_scores_cleanliness 1 0.00134 0.57474 -614.36
## + host_identity_verified 1 0.00081 0.57527 -614.24
## + reviews_per_month 1 0.00075 0.57533 -614.23
## + instant_bookable 1 0.00066 0.57542 -614.21
## - host_total_listings_count 1 0.01767 0.59375 -614.16
## + host_is_superhost 1 0.00035 0.57574 -614.14
## + review_scores_communication 1 0.00022 0.57586 -614.11
## - number_of_reviews 1 0.02907 0.60515 -611.71
## - ops_year 1 0.03142 0.60750 -611.21
## - acceptance_rate 1 0.03271 0.60879 -610.94
## - review_scores_rating 1 0.03556 0.61164 -610.33
## - response_time 2 0.04616 0.62224 -610.12
## - review_scores_location 1 0.04277 0.61885 -608.82
## - room_type 1 0.05907 0.63515 -605.47
## - neighbourhood 26 0.54027 1.11635 -582.72
##
## Step: AIC=-617.07
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_location +
## amen_items + ops_year + acceptance_rate + host_total_listings_count +
## word.count + syllable.count + FK_grd.lvl + room_type + response_time +
## neighbourhood
##
## Df Sum of Sq RSS AIC
## - amen_items 1 0.00648 0.58698 -617.64
## - syllable.count 1 0.00822 0.58872 -617.26
## - FK_grd.lvl 1 0.00823 0.58873 -617.26
## - word.count 1 0.00834 0.58884 -617.23
## <none> 0.58050 -617.07
## + review_scores_value 1 0.00442 0.57608 -616.06
## + formality 1 0.00116 0.57934 -615.33
## + reviews_per_month 1 0.00093 0.57957 -615.28
## + instant_bookable 1 0.00082 0.57968 -615.26
## + review_scores_checkin 1 0.00077 0.57973 -615.25
## + review_scores_accuracy 1 0.00069 0.57981 -615.23
## + review_scores_cleanliness 1 0.00055 0.57995 -615.20
## + host_is_superhost 1 0.00045 0.58005 -615.17
## + host_identity_verified 1 0.00024 0.58026 -615.13
## + review_scores_communication 1 0.00009 0.58041 -615.09
## - host_total_listings_count 1 0.02021 0.60071 -614.66
## - number_of_reviews 1 0.02875 0.60925 -612.84
## - review_scores_rating 1 0.03136 0.61186 -612.29
## - ops_year 1 0.03227 0.61277 -612.10
## - acceptance_rate 1 0.03412 0.61462 -611.71
## - response_time 2 0.04596 0.62646 -611.25
## - review_scores_location 1 0.04044 0.62094 -610.39
## - room_type 1 0.06879 0.64929 -604.63
## - neighbourhood 26 0.54156 1.12206 -584.06
##
## Step: AIC=-617.64
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_location +
## ops_year + acceptance_rate + host_total_listings_count +
## word.count + syllable.count + FK_grd.lvl + room_type + response_time +
## neighbourhood
##
## Df Sum of Sq RSS AIC
## - syllable.count 1 0.00824 0.59523 -617.84
## - FK_grd.lvl 1 0.00858 0.59556 -617.77
## - word.count 1 0.00874 0.59572 -617.74
## <none> 0.58698 -617.64
## + amen_items 1 0.00648 0.58050 -617.07
## + review_scores_value 1 0.00595 0.58103 -616.96
## + review_scores_checkin 1 0.00264 0.58434 -616.22
## + formality 1 0.00174 0.58524 -616.02
## + review_scores_accuracy 1 0.00139 0.58559 -615.95
## + review_scores_cleanliness 1 0.00124 0.58574 -615.91
## + instant_bookable 1 0.00085 0.58613 -615.83
## + host_is_superhost 1 0.00082 0.58617 -615.82
## + reviews_per_month 1 0.00077 0.58621 -615.81
## + review_scores_communication 1 0.00015 0.58683 -615.67
## + host_identity_verified 1 0.00013 0.58686 -615.67
## - host_total_listings_count 1 0.01998 0.60696 -615.32
## - number_of_reviews 1 0.02615 0.61313 -614.02
## - acceptance_rate 1 0.02843 0.61541 -613.54
## - review_scores_rating 1 0.02898 0.61596 -613.43
## - ops_year 1 0.03085 0.61783 -613.04
## - response_time 2 0.04604 0.63302 -611.90
## - review_scores_location 1 0.03871 0.62569 -611.40
## - room_type 1 0.06682 0.65380 -605.74
## - neighbourhood 26 0.54129 1.12827 -585.35
##
## Step: AIC=-617.84
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_location +
## ops_year + acceptance_rate + host_total_listings_count +
## word.count + FK_grd.lvl + room_type + response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - FK_grd.lvl 1 0.00066 0.59589 -619.70
## - word.count 1 0.00110 0.59632 -619.61
## <none> 0.59523 -617.84
## + syllable.count 1 0.00824 0.58698 -617.64
## + amen_items 1 0.00650 0.58872 -617.26
## + review_scores_value 1 0.00458 0.59065 -616.84
## + review_scores_cleanliness 1 0.00427 0.59095 -616.77
## + review_scores_checkin 1 0.00329 0.59194 -616.56
## + formality 1 0.00326 0.59196 -616.55
## + instant_bookable 1 0.00248 0.59275 -616.38
## + host_is_superhost 1 0.00117 0.59405 -616.10
## + reviews_per_month 1 0.00104 0.59419 -616.07
## + review_scores_accuracy 1 0.00052 0.59470 -615.96
## + review_scores_communication 1 0.00034 0.59489 -615.92
## + host_identity_verified 1 0.00001 0.59522 -615.84
## - number_of_reviews 1 0.02261 0.61783 -615.03
## - host_total_listings_count 1 0.02600 0.62123 -614.33
## - acceptance_rate 1 0.02763 0.62286 -613.99
## - ops_year 1 0.02789 0.62311 -613.94
## - review_scores_rating 1 0.03389 0.62912 -612.70
## - response_time 2 0.04818 0.64341 -611.80
## - review_scores_location 1 0.04181 0.63703 -611.09
## - room_type 1 0.08630 0.68152 -602.38
## - neighbourhood 26 0.54752 1.14274 -585.70
##
## Step: AIC=-619.7
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_location +
## ops_year + acceptance_rate + host_total_listings_count +
## word.count + room_type + response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## - word.count 1 0.00105 0.59694 -621.47
## <none> 0.59589 -619.70
## + amen_items 1 0.00714 0.58874 -619.26
## + review_scores_value 1 0.00468 0.59121 -618.72
## + review_scores_cleanliness 1 0.00412 0.59177 -618.59
## + review_scores_checkin 1 0.00357 0.59232 -618.47
## + formality 1 0.00325 0.59263 -618.41
## + instant_bookable 1 0.00243 0.59345 -618.23
## + host_is_superhost 1 0.00125 0.59463 -617.97
## + reviews_per_month 1 0.00112 0.59476 -617.94
## + FK_grd.lvl 1 0.00066 0.59523 -617.84
## + FK_read.ease 1 0.00066 0.59523 -617.84
## + review_scores_accuracy 1 0.00051 0.59537 -617.81
## + syllable.count 1 0.00032 0.59556 -617.77
## + review_scores_communication 1 0.00030 0.59558 -617.77
## + host_identity_verified 1 0.00003 0.59586 -617.71
## - number_of_reviews 1 0.02232 0.61821 -616.96
## - acceptance_rate 1 0.02709 0.62298 -615.96
## - ops_year 1 0.02785 0.62374 -615.81
## - host_total_listings_count 1 0.02870 0.62459 -615.63
## - response_time 2 0.04808 0.64397 -613.69
## - review_scores_rating 1 0.03905 0.63493 -613.51
## - review_scores_location 1 0.04160 0.63748 -613.00
## - room_type 1 0.08572 0.68160 -604.36
## - neighbourhood 26 0.54686 1.14275 -587.70
##
## Step: AIC=-621.47
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_location +
## ops_year + acceptance_rate + host_total_listings_count +
## room_type + response_time + neighbourhood
##
## Df Sum of Sq RSS AIC
## <none> 0.59694 -621.47
## + amen_items 1 0.00741 0.58953 -621.08
## + review_scores_checkin 1 0.00435 0.59259 -620.42
## + formality 1 0.00429 0.59265 -620.40
## + review_scores_cleanliness 1 0.00402 0.59292 -620.34
## + review_scores_value 1 0.00371 0.59323 -620.28
## + instant_bookable 1 0.00295 0.59399 -620.11
## + host_is_superhost 1 0.00168 0.59526 -619.84
## + reviews_per_month 1 0.00115 0.59579 -619.72
## + word.count 1 0.00105 0.59589 -619.70
## + review_scores_accuracy 1 0.00088 0.59606 -619.66
## + FK_grd.lvl 1 0.00062 0.59632 -619.61
## + syllable.count 1 0.00010 0.59684 -619.49
## + host_identity_verified 1 0.00004 0.59690 -619.48
## + review_scores_communication 1 0.00001 0.59693 -619.47
## + FK_read.ease 1 0.00000 0.59694 -619.47
## - number_of_reviews 1 0.02144 0.61838 -618.92
## - acceptance_rate 1 0.02705 0.62399 -617.76
## - host_total_listings_count 1 0.02912 0.62606 -617.33
## - ops_year 1 0.03124 0.62817 -616.89
## - review_scores_rating 1 0.03964 0.63658 -615.18
## - response_time 2 0.05024 0.64718 -615.05
## - review_scores_location 1 0.04269 0.63963 -614.56
## - room_type 1 0.08685 0.68379 -605.95
## - neighbourhood 26 0.54588 1.14281 -589.70
# Show the stepwise model path: which variable was removed at each step,
# with the residual deviance and AIC after each removal.
aic_model4$anova
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## price_adj ~ number_of_reviews + reviews_per_month + review_scores_rating +
## review_scores_accuracy + review_scores_cleanliness + review_scores_checkin +
## review_scores_communication + review_scores_location + review_scores_value +
## amen_items + ops_year + acceptance_rate + host_total_listings_count +
## formality + word.count + syllable.count + FK_grd.lvl + FK_read.ease +
## host_is_superhost + host_identity_verified + room_type +
## instant_bookable + response_time + neighbourhood
##
## Final Model:
## price_adj ~ number_of_reviews + review_scores_rating + review_scores_location +
## ops_year + acceptance_rate + host_total_listings_count +
## room_type + response_time + neighbourhood
##
##
## Step Df Deviance Resid. Df Resid. Dev
## 1 79 0.5692684
## 2 - FK_read.ease 0 0.000000000000 79 0.5692684
## 3 - review_scores_communication 1 0.000002209098 80 0.5692706
## 4 - instant_bookable 1 0.000058600993 81 0.5693292
## 5 - review_scores_accuracy 1 0.000211048081 82 0.5695402
## 6 - host_is_superhost 1 0.000350201736 83 0.5698904
## 7 - host_identity_verified 1 0.000630306748 84 0.5705207
## 8 - reviews_per_month 1 0.000954510557 85 0.5714753
## 9 - review_scores_cleanliness 1 0.000935741754 86 0.5724110
## 10 - formality 1 0.001371273334 87 0.5737823
## 11 - review_scores_checkin 1 0.002298857138 88 0.5760811
## 12 - review_scores_value 1 0.004419433878 89 0.5805006
## 13 - amen_items 1 0.006480164203 90 0.5869807
## 14 - syllable.count 1 0.008244940849 91 0.5952257
## 15 - FK_grd.lvl 1 0.000659882062 92 0.5958856
## 16 - word.count 1 0.001052907607 93 0.5969385
## AIC
## 1 -599.5948
## 2 -599.5948
## 3 -601.5943
## 4 -603.5810
## 5 -605.5332
## 6 -607.4539
## 7 -609.3113
## 8 -611.0957
## 9 -612.8846
## 10 -614.5760
## 11 -616.0602
## 12 -617.0743
## 13 -617.6423
## 14 -617.8429
## 15 -619.7000
## 16 -621.4722
The stepwise approach arrives at the final set of predictors by removing several variables, such as ‘FK_read.ease’ and ‘review_scores_communication’.
Let’s check the result from the stepwise approach.
# Re-fit with a reduced predictor set, then run multicollinearity
# diagnostics (VIF, TOL, CVIF, Klein's rule, ...) via mctest::imcdiag().
# NOTE(review): this formula does NOT match the stepwise final model shown
# above — the AIC-selected model kept number_of_reviews and
# review_scores_location and dropped reviews_per_month,
# review_scores_accuracy, review_scores_cleanliness, review_scores_value
# and formality. Confirm which specification was intended here.
model5 <- lm(price_adj ~ reviews_per_month + review_scores_rating +
review_scores_accuracy + review_scores_cleanliness + review_scores_value +
ops_year + acceptance_rate + host_total_listings_count +
formality + room_type + response_time + neighbourhood, data = df_reg_des3,
na.action = na.exclude)
# Individual multicollinearity diagnostics for each regressor.
imcdiag(model5)
##
## Call:
## imcdiag(mod = model5)
##
##
## All Individual Multicollinearity Diagnostics Result
##
## VIF TOL Wi Fi Leamer
## reviews_per_month 1.3853 0.7219 0.9476 0.9847 0.8496
## review_scores_rating 5.5382 0.1806 11.1614 11.5975 0.4249
## review_scores_accuracy 3.1873 0.3137 5.3795 5.5897 0.5601
## review_scores_cleanliness 4.1775 0.2394 7.8148 8.1202 0.4893
## review_scores_value 1.7924 0.5579 1.9489 2.0251 0.7469
## ops_year 1.9920 0.5020 2.4398 2.5351 0.7085
## acceptance_rate 1.6420 0.6090 1.5789 1.6406 0.7804
## host_total_listings_count 1.8513 0.5402 2.0938 2.1756 0.7350
## formality 1.3464 0.7427 0.8520 0.8853 0.8618
## room_typePrivate room 1.5649 0.6390 1.3894 1.4437 0.7994
## response_timewithin a day 1.4470 0.6911 1.0994 1.1423 0.8313
## response_timea few days or more 1.9875 0.5032 2.4286 2.5235 0.7093
## neighbourhoodavonmouth & lawrence weston 1.7163 0.5826 1.7617 1.8306 0.7633
## neighbourhoodbedminster 1.3272 0.7535 0.8047 0.8362 0.8680
## neighbourhoodbishopston & ashley down 1.2444 0.8036 0.6010 0.6245 0.8964
## neighbourhoodbrislington east 1.1331 0.8826 0.3273 0.3400 0.9394
## neighbourhoodbrislington west 1.1295 0.8853 0.3186 0.3310 0.9409
## neighbourhoodcentral 1.1968 0.8355 0.4841 0.5030 0.9141
## neighbourhoodclifton 1.4633 0.6834 1.1394 1.1839 0.8267
## neighbourhoodclifton down 1.4568 0.6864 1.1235 1.1674 0.8285
## neighbourhoodcotham 1.2544 0.7972 0.6257 0.6502 0.8929
## neighbourhoodeaston 1.3880 0.7205 0.9542 0.9915 0.8488
## neighbourhoodeastville 1.5891 0.6293 1.4488 1.5054 0.7933
## neighbourhoodfilwood 1.2438 0.8040 0.5996 0.6230 0.8967
## neighbourhoodfrome vale 1.1297 0.8852 0.3189 0.3314 0.9409
## neighbourhoodhenbury & brentry 1.1080 0.9025 0.2657 0.2760 0.9500
## neighbourhoodhorfield 1.2491 0.8006 0.6126 0.6366 0.8948
## neighbourhoodhotwells & harbourside 1.3549 0.7381 0.8729 0.9070 0.8591
## neighbourhoodknowle 1.2586 0.7945 0.6360 0.6609 0.8914
## neighbourhoodlawrence hill 1.3381 0.7473 0.8315 0.8640 0.8645
## neighbourhoodredland 1.2686 0.7883 0.6605 0.6863 0.8879
## neighbourhoodsouthmead 1.2407 0.8060 0.5920 0.6151 0.8978
## neighbourhoodsouthville 1.1576 0.8638 0.3877 0.4028 0.9294
## neighbourhoodst george troopers hill 1.2443 0.8037 0.6009 0.6243 0.8965
## neighbourhoodst george west 1.2596 0.7939 0.6386 0.6635 0.8910
## neighbourhoodstoke bishop 1.0719 0.9329 0.1768 0.1838 0.9659
## neighbourhoodwestbury-on-trym & henleaze 1.2730 0.7855 0.6715 0.6977 0.8863
## neighbourhoodwindmill hill 1.4778 0.6767 1.1752 1.2211 0.8226
## CVIF Klein IND1 IND2
## reviews_per_month 1.7728 0 0.2935 0.9290
## review_scores_rating 7.0873 1 0.0734 2.7371
## review_scores_accuracy 4.0788 1 0.1276 2.2922
## review_scores_cleanliness 5.3460 1 0.0973 2.5407
## review_scores_value 2.2938 0 0.2268 1.4767
## ops_year 2.5492 0 0.2041 1.6634
## acceptance_rate 2.1013 0 0.2476 1.3060
## host_total_listings_count 2.3692 0 0.2196 1.5360
## formality 1.7230 0 0.3020 0.8594
## room_typePrivate room 2.0027 0 0.2598 1.2058
## response_timewithin a day 1.8517 0 0.2810 1.0318
## response_timea few days or more 2.5434 0 0.2046 1.6596
## neighbourhoodavonmouth & lawrence weston 2.1964 0 0.2369 1.3941
## neighbourhoodbedminster 1.6984 0 0.3064 0.8235
## neighbourhoodbishopston & ashley down 1.5924 0 0.3267 0.6560
## neighbourhoodbrislington east 1.4500 0 0.3588 0.3923
## neighbourhoodbrislington west 1.4455 0 0.3600 0.3831
## neighbourhoodcentral 1.5316 0 0.3397 0.5494
## neighbourhoodclifton 1.8726 0 0.2779 1.0575
## neighbourhoodclifton down 1.8643 0 0.2791 1.0474
## neighbourhoodcotham 1.6053 0 0.3241 0.6775
## neighbourhoodeaston 1.7762 0 0.2929 0.9337
## neighbourhoodeastville 2.0336 0 0.2559 1.2382
## neighbourhoodfilwood 1.5917 0 0.3269 0.6547
## neighbourhoodfrome vale 1.4457 0 0.3599 0.3834
## neighbourhoodhenbury & brentry 1.4180 0 0.3670 0.3256
## neighbourhoodhorfield 1.5985 0 0.3255 0.6661
## neighbourhoodhotwells & harbourside 1.7339 0 0.3001 0.8749
## neighbourhoodknowle 1.6107 0 0.3231 0.6863
## neighbourhoodlawrence hill 1.7124 0 0.3039 0.8440
## neighbourhoodredland 1.6234 0 0.3205 0.7072
## neighbourhoodsouthmead 1.5877 0 0.3277 0.6480
## neighbourhoodsouthville 1.4815 0 0.3512 0.4548
## neighbourhoodst george troopers hill 1.5924 0 0.3268 0.6558
## neighbourhoodst george west 1.6120 0 0.3228 0.6885
## neighbourhoodstoke bishop 1.3717 0 0.3793 0.2241
## neighbourhoodwestbury-on-trym & henleaze 1.6291 0 0.3194 0.7164
## neighbourhoodwindmill hill 1.8912 0 0.2751 1.0800
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## reviews_per_month , review_scores_rating , review_scores_accuracy , review_scores_cleanliness , review_scores_value , acceptance_rate , host_total_listings_count , formality , response_timewithin a day , neighbourhoodbedminster , neighbourhoodbishopston & ashley down , neighbourhoodbrislington west , neighbourhoodcentral , neighbourhoodeastville , neighbourhoodfilwood , neighbourhoodfrome vale , neighbourhoodhenbury & brentry , neighbourhoodhorfield , neighbourhoodhotwells & harbourside , neighbourhoodknowle , neighbourhoodlawrence hill , neighbourhoodredland , neighbourhoodsouthmead , neighbourhoodsouthville , neighbourhoodst george troopers hill , neighbourhoodst george west , neighbourhoodstoke bishop , neighbourhoodwestbury-on-trym & henleaze , coefficient(s) are non-significant may be due to multicollinearity
##
## R-square of y on all x: 0.6321
##
## * use method argument to check which regressors may be the reason of collinearity
## ===================================
# Fit model 6: OLS regression of (adjusted) price on review activity,
# review-score components, host characteristics, listing attributes,
# and neighbourhood fixed effects.
# na.exclude (rather than na.omit) keeps NA placeholders in
# fitted()/residuals() so they stay aligned with rows of df_reg_des3.
model6 <- lm(price_adj ~ reviews_per_month +
review_scores_accuracy + review_scores_cleanliness + review_scores_value +
ops_year + acceptance_rate + host_total_listings_count +
formality + room_type + response_time + neighbourhood, data = df_reg_des3,
na.action = na.exclude)
# summary: tidy coefficient table rendered with kable.
# NOTE: caption fixed — the response is price_adj (price), not rating scores;
# the old caption was copy-pasted from the ratings model.
model6 %>% tidy() %>% kable(
caption = "Coefficient Estimation for Price Prediction",
col.names = c("Predictor", "B", "SE", "t", "p"),
digits = c(0, 2, 2, 2, 2)
)
| Predictor | B | SE | t | p |
|---|---|---|---|---|
| (Intercept) | -0.05 | 0.19 | -0.27 | 0.78 |
| reviews_per_month | -0.14 | 0.07 | -1.94 | 0.06 |
| review_scores_accuracy | 0.24 | 0.23 | 1.05 | 0.30 |
| review_scores_cleanliness | 0.14 | 0.10 | 1.47 | 0.15 |
| review_scores_value | -0.08 | 0.12 | -0.70 | 0.49 |
| ops_year | 0.12 | 0.05 | 2.34 | 0.02 |
| acceptance_rate | 0.06 | 0.03 | 1.80 | 0.08 |
| host_total_listings_count | 1.69 | 0.99 | 1.70 | 0.09 |
| formality | -0.05 | 0.05 | -1.00 | 0.32 |
| room_typePrivate room | -0.07 | 0.02 | -3.93 | 0.00 |
| response_timewithin a day | 0.02 | 0.02 | 0.92 | 0.36 |
| response_timea few days or more | 0.08 | 0.04 | 2.08 | 0.04 |
| neighbourhoodavonmouth & lawrence weston | -0.15 | 0.05 | -3.02 | 0.00 |
| neighbourhoodbedminster | 0.00 | 0.06 | -0.07 | 0.94 |
| neighbourhoodbishopston & ashley down | 0.00 | 0.04 | 0.02 | 0.98 |
| neighbourhoodbrislington east | -0.31 | 0.09 | -3.50 | 0.00 |
| neighbourhoodbrislington west | 0.04 | 0.09 | 0.46 | 0.65 |
| neighbourhoodcentral | 0.02 | 0.05 | 0.36 | 0.72 |
| neighbourhoodclifton | 0.14 | 0.05 | 3.07 | 0.00 |
| neighbourhoodclifton down | 0.13 | 0.04 | 2.89 | 0.00 |
| neighbourhoodcotham | 0.13 | 0.04 | 3.34 | 0.00 |
| neighbourhoodeaston | -0.12 | 0.04 | -2.75 | 0.01 |
| neighbourhoodeastville | 0.00 | 0.03 | -0.16 | 0.87 |
| neighbourhoodfilwood | -0.05 | 0.07 | -0.81 | 0.42 |
| neighbourhoodfrome vale | -0.09 | 0.09 | -1.04 | 0.30 |
| neighbourhoodhenbury & brentry | -0.06 | 0.09 | -0.67 | 0.51 |
| neighbourhoodhorfield | -0.07 | 0.09 | -0.77 | 0.44 |
| neighbourhoodhotwells & harbourside | 0.07 | 0.04 | 1.96 | 0.05 |
| neighbourhoodknowle | -0.01 | 0.07 | -0.11 | 0.91 |
| neighbourhoodlawrence hill | 0.00 | 0.04 | 0.03 | 0.98 |
| neighbourhoodredland | 0.07 | 0.04 | 1.69 | 0.09 |
| neighbourhoodsouthmead | 0.01 | 0.09 | 0.06 | 0.95 |
| neighbourhoodsouthville | 0.08 | 0.05 | 1.61 | 0.11 |
| neighbourhoodst george troopers hill | 0.07 | 0.07 | 1.10 | 0.28 |
| neighbourhoodst george west | -0.13 | 0.09 | -1.38 | 0.17 |
| neighbourhoodstoke bishop | 0.07 | 0.09 | 0.85 | 0.40 |
| neighbourhoodwestbury-on-trym & henleaze | 0.05 | 0.05 | 0.95 | 0.35 |
| neighbourhoodwindmill hill | -0.08 | 0.03 | -2.48 | 0.01 |
# check multicollinearity
# imcdiag() (mctest package) reports per-regressor diagnostics for model6:
# VIF, tolerance, Wi/Fi statistics, Leamer, CVIF, Klein's rule, IND1/IND2.
# A "1" in a test column flags that regressor as collinear; per the output
# below, no VIF exceeds ~2.6, so no severe collinearity is detected.
imcdiag(model6)
##
## Call:
## imcdiag(mod = model6)
##
##
## All Individual Multicollinearity Diagnostics Result
##
## VIF TOL Wi Fi Leamer
## reviews_per_month 1.3114 0.7626 0.7957 0.8273 0.8733
## review_scores_accuracy 2.6398 0.3788 4.1905 4.3571 0.6155
## review_scores_cleanliness 2.4733 0.4043 3.7651 3.9148 0.6359
## review_scores_value 1.7785 0.5623 1.9895 2.0686 0.7498
## ops_year 1.9891 0.5027 2.5278 2.6283 0.7090
## acceptance_rate 1.6391 0.6101 1.6333 1.6982 0.7811
## host_total_listings_count 1.7952 0.5571 2.0321 2.1128 0.7464
## formality 1.3062 0.7656 0.7825 0.8136 0.8750
## room_typePrivate room 1.5435 0.6479 1.3890 1.4442 0.8049
## response_timewithin a day 1.4234 0.7025 1.0820 1.1250 0.8382
## response_timea few days or more 1.9293 0.5183 2.3750 2.4694 0.7199
## neighbourhoodavonmouth & lawrence weston 1.7162 0.5827 1.8304 1.9032 0.7633
## neighbourhoodbedminster 1.3155 0.7602 0.8062 0.8383 0.8719
## neighbourhoodbishopston & ashley down 1.2405 0.8061 0.6147 0.6392 0.8978
## neighbourhoodbrislington east 1.1274 0.8870 0.3256 0.3386 0.9418
## neighbourhoodbrislington west 1.1295 0.8853 0.3310 0.3441 0.9409
## neighbourhoodcentral 1.1906 0.8399 0.4871 0.5065 0.9165
## neighbourhoodclifton 1.4520 0.6887 1.1552 1.2011 0.8299
## neighbourhoodclifton down 1.3724 0.7286 0.9518 0.9896 0.8536
## neighbourhoodcotham 1.2541 0.7974 0.6493 0.6751 0.8930
## neighbourhoodeaston 1.3324 0.7505 0.8494 0.8832 0.8663
## neighbourhoodeastville 1.5879 0.6298 1.5024 1.5621 0.7936
## neighbourhoodfilwood 1.2423 0.8050 0.6192 0.6438 0.8972
## neighbourhoodfrome vale 1.1286 0.8861 0.3286 0.3417 0.9413
## neighbourhoodhenbury & brentry 1.1080 0.9025 0.2760 0.2869 0.9500
## neighbourhoodhorfield 1.2373 0.8082 0.6065 0.6306 0.8990
## neighbourhoodhotwells & harbourside 1.3547 0.7382 0.9063 0.9424 0.8592
## neighbourhoodknowle 1.2370 0.8084 0.6057 0.6297 0.8991
## neighbourhoodlawrence hill 1.3164 0.7596 0.8086 0.8408 0.8716
## neighbourhoodredland 1.2686 0.7883 0.6863 0.7136 0.8879
## neighbourhoodsouthmead 1.2389 0.8072 0.6105 0.6348 0.8984
## neighbourhoodsouthville 1.1576 0.8638 0.4028 0.4188 0.9294
## neighbourhoodst george troopers hill 1.2339 0.8105 0.5977 0.6215 0.9003
## neighbourhoodst george west 1.2590 0.7943 0.6619 0.6882 0.8912
## neighbourhoodstoke bishop 1.0686 0.9358 0.1754 0.1823 0.9674
## neighbourhoodwestbury-on-trym & henleaze 1.2725 0.7859 0.6964 0.7241 0.8865
## neighbourhoodwindmill hill 1.4668 0.6818 1.1929 1.2403 0.8257
## CVIF Klein IND1 IND2
## reviews_per_month 1.6506 0 0.2984 0.8736
## review_scores_accuracy 3.3227 0 0.1482 2.2855
## review_scores_cleanliness 3.1131 0 0.1582 2.1917
## review_scores_value 2.2386 0 0.2200 1.6106
## ops_year 2.5037 0 0.1967 1.8296
## acceptance_rate 2.0631 0 0.2387 1.4346
## host_total_listings_count 2.2595 0 0.2180 1.6297
## formality 1.6441 0 0.2996 0.8625
## room_typePrivate room 1.9428 0 0.2535 1.2956
## response_timewithin a day 1.7916 0 0.2749 1.0944
## response_timea few days or more 2.4285 0 0.2028 1.7723
## neighbourhoodavonmouth & lawrence weston 2.1602 0 0.2280 1.5355
## neighbourhoodbedminster 1.6558 0 0.2975 0.8824
## neighbourhoodbishopston & ashley down 1.5615 0 0.3154 0.7134
## neighbourhoodbrislington east 1.4191 0 0.3471 0.4158
## neighbourhoodbrislington west 1.4217 0 0.3464 0.4219
## neighbourhoodcentral 1.4986 0 0.3287 0.5890
## neighbourhoodclifton 1.8277 0 0.2695 1.1454
## neighbourhoodclifton down 1.7275 0 0.2851 0.9984
## neighbourhoodcotham 1.5785 0 0.3120 0.7454
## neighbourhoodeaston 1.6770 0 0.2937 0.9178
## neighbourhoodeastville 1.9987 0 0.2464 1.3622
## neighbourhoodfilwood 1.5636 0 0.3150 0.7176
## neighbourhoodfrome vale 1.4206 0 0.3467 0.4192
## neighbourhoodhenbury & brentry 1.3946 0 0.3532 0.3586
## neighbourhoodhorfield 1.5574 0 0.3162 0.7058
## neighbourhoodhotwells & harbourside 1.7051 0 0.2889 0.9633
## neighbourhoodknowle 1.5570 0 0.3163 0.7049
## neighbourhoodlawrence hill 1.6570 0 0.2973 0.8844
## neighbourhoodredland 1.5967 0 0.3085 0.7789
## neighbourhoodsouthmead 1.5594 0 0.3159 0.7095
## neighbourhoodsouthville 1.4571 0 0.3380 0.5010
## neighbourhoodst george troopers hill 1.5531 0 0.3171 0.6974
## neighbourhoodst george west 1.5847 0 0.3108 0.7569
## neighbourhoodstoke bishop 1.3451 0 0.3662 0.2363
## neighbourhoodwestbury-on-trym & henleaze 1.6017 0 0.3075 0.7879
## neighbourhoodwindmill hill 1.8462 0 0.2668 1.1709
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
##
## reviews_per_month , review_scores_accuracy , review_scores_cleanliness , review_scores_value , acceptance_rate , host_total_listings_count , formality , response_timewithin a day , neighbourhoodbedminster , neighbourhoodbishopston & ashley down , neighbourhoodbrislington west , neighbourhoodcentral , neighbourhoodeastville , neighbourhoodfilwood , neighbourhoodfrome vale , neighbourhoodhenbury & brentry , neighbourhoodhorfield , neighbourhoodhotwells & harbourside , neighbourhoodknowle , neighbourhoodlawrence hill , neighbourhoodredland , neighbourhoodsouthmead , neighbourhoodsouthville , neighbourhoodst george troopers hill , neighbourhoodst george west , neighbourhoodstoke bishop , neighbourhoodwestbury-on-trym & henleaze , coefficient(s) are non-significant may be due to multicollinearity
##
## R-square of y on all x: 0.6294
##
## * use method argument to check which regressors may be the reason of collinearity
## ===================================